1 /* Copyright 2008, Google Inc.
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions are
8 * * Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * * Redistributions in binary form must reproduce the above
11 * copyright notice, this list of conditions and the following disclaimer
12 * in the documentation and/or other materials provided with the
14 * * Neither the name of Google Inc. nor the names of its
15 * contributors may be used to endorse or promote products derived from
16 * this software without specific prior written permission.
18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 * curve25519-donna: Curve25519 elliptic curve, public key function
32 * http://code.google.com/p/curve25519-donna/
34 * Adam Langley <agl@imperialviolet.org>
36 * Derived from public domain C code by Daniel J. Bernstein <djb@cr.yp.to>
38 * More information about curve25519 can be found here
39 * http://cr.yp.to/ecdh.html
41 * djb's sample implementation of curve25519 is written in a special assembly
42 * language called qhasm and uses the floating point registers.
44 * This is, almost, a clean room reimplementation from the curve25519 paper. It
45 * uses many of the tricks described therein. Only the crecip function is taken
46 * from the sample implementation.
58 /* Field element representation:
60 * Field elements are written as an array of signed, 64-bit limbs, least
61 * significant first. The value of the field element is:
62 * x[0] + 2^26·x[1] + x^51·x[2] + 2^102·x[3] + ...
64 * i.e. the limbs are 26, 25, 26, 25, ... bits wide.
67 /* Sum two numbers: output += in */
68 static void fsum(limb
*output
, const limb
*in
) {
70 for (i
= 0; i
< 10; i
+= 2) {
71 output
[0+i
] = (output
[0+i
] + in
[0+i
]);
72 output
[1+i
] = (output
[1+i
] + in
[1+i
]);
76 /* Find the difference of two numbers: output = in - output
77 * (note the order of the arguments!)
79 static void fdifference(limb
*output
, const limb
*in
) {
81 for (i
= 0; i
< 10; ++i
) {
82 output
[i
] = (in
[i
] - output
[i
]);
86 /* Multiply a number by a scalar: output = in * scalar */
87 static void fscalar_product(limb
*output
, const limb
*in
, const limb scalar
) {
89 for (i
= 0; i
< 10; ++i
) {
90 output
[i
] = in
[i
] * scalar
;
94 /* Multiply two numbers: output = in2 * in
96 * output must be distinct to both inputs. The inputs are reduced coefficient
97 * form, the output is not.
99 static void fproduct(limb
*output
, const limb
*in2
, const limb
*in
) {
100 output
[0] = ((limb
) ((s32
) in2
[0])) * ((s32
) in
[0]);
101 output
[1] = ((limb
) ((s32
) in2
[0])) * ((s32
) in
[1]) +
102 ((limb
) ((s32
) in2
[1])) * ((s32
) in
[0]);
103 output
[2] = 2 * ((limb
) ((s32
) in2
[1])) * ((s32
) in
[1]) +
104 ((limb
) ((s32
) in2
[0])) * ((s32
) in
[2]) +
105 ((limb
) ((s32
) in2
[2])) * ((s32
) in
[0]);
106 output
[3] = ((limb
) ((s32
) in2
[1])) * ((s32
) in
[2]) +
107 ((limb
) ((s32
) in2
[2])) * ((s32
) in
[1]) +
108 ((limb
) ((s32
) in2
[0])) * ((s32
) in
[3]) +
109 ((limb
) ((s32
) in2
[3])) * ((s32
) in
[0]);
110 output
[4] = ((limb
) ((s32
) in2
[2])) * ((s32
) in
[2]) +
111 2 * (((limb
) ((s32
) in2
[1])) * ((s32
) in
[3]) +
112 ((limb
) ((s32
) in2
[3])) * ((s32
) in
[1])) +
113 ((limb
) ((s32
) in2
[0])) * ((s32
) in
[4]) +
114 ((limb
) ((s32
) in2
[4])) * ((s32
) in
[0]);
115 output
[5] = ((limb
) ((s32
) in2
[2])) * ((s32
) in
[3]) +
116 ((limb
) ((s32
) in2
[3])) * ((s32
) in
[2]) +
117 ((limb
) ((s32
) in2
[1])) * ((s32
) in
[4]) +
118 ((limb
) ((s32
) in2
[4])) * ((s32
) in
[1]) +
119 ((limb
) ((s32
) in2
[0])) * ((s32
) in
[5]) +
120 ((limb
) ((s32
) in2
[5])) * ((s32
) in
[0]);
121 output
[6] = 2 * (((limb
) ((s32
) in2
[3])) * ((s32
) in
[3]) +
122 ((limb
) ((s32
) in2
[1])) * ((s32
) in
[5]) +
123 ((limb
) ((s32
) in2
[5])) * ((s32
) in
[1])) +
124 ((limb
) ((s32
) in2
[2])) * ((s32
) in
[4]) +
125 ((limb
) ((s32
) in2
[4])) * ((s32
) in
[2]) +
126 ((limb
) ((s32
) in2
[0])) * ((s32
) in
[6]) +
127 ((limb
) ((s32
) in2
[6])) * ((s32
) in
[0]);
128 output
[7] = ((limb
) ((s32
) in2
[3])) * ((s32
) in
[4]) +
129 ((limb
) ((s32
) in2
[4])) * ((s32
) in
[3]) +
130 ((limb
) ((s32
) in2
[2])) * ((s32
) in
[5]) +
131 ((limb
) ((s32
) in2
[5])) * ((s32
) in
[2]) +
132 ((limb
) ((s32
) in2
[1])) * ((s32
) in
[6]) +
133 ((limb
) ((s32
) in2
[6])) * ((s32
) in
[1]) +
134 ((limb
) ((s32
) in2
[0])) * ((s32
) in
[7]) +
135 ((limb
) ((s32
) in2
[7])) * ((s32
) in
[0]);
136 output
[8] = ((limb
) ((s32
) in2
[4])) * ((s32
) in
[4]) +
137 2 * (((limb
) ((s32
) in2
[3])) * ((s32
) in
[5]) +
138 ((limb
) ((s32
) in2
[5])) * ((s32
) in
[3]) +
139 ((limb
) ((s32
) in2
[1])) * ((s32
) in
[7]) +
140 ((limb
) ((s32
) in2
[7])) * ((s32
) in
[1])) +
141 ((limb
) ((s32
) in2
[2])) * ((s32
) in
[6]) +
142 ((limb
) ((s32
) in2
[6])) * ((s32
) in
[2]) +
143 ((limb
) ((s32
) in2
[0])) * ((s32
) in
[8]) +
144 ((limb
) ((s32
) in2
[8])) * ((s32
) in
[0]);
145 output
[9] = ((limb
) ((s32
) in2
[4])) * ((s32
) in
[5]) +
146 ((limb
) ((s32
) in2
[5])) * ((s32
) in
[4]) +
147 ((limb
) ((s32
) in2
[3])) * ((s32
) in
[6]) +
148 ((limb
) ((s32
) in2
[6])) * ((s32
) in
[3]) +
149 ((limb
) ((s32
) in2
[2])) * ((s32
) in
[7]) +
150 ((limb
) ((s32
) in2
[7])) * ((s32
) in
[2]) +
151 ((limb
) ((s32
) in2
[1])) * ((s32
) in
[8]) +
152 ((limb
) ((s32
) in2
[8])) * ((s32
) in
[1]) +
153 ((limb
) ((s32
) in2
[0])) * ((s32
) in
[9]) +
154 ((limb
) ((s32
) in2
[9])) * ((s32
) in
[0]);
155 output
[10] = 2 * (((limb
) ((s32
) in2
[5])) * ((s32
) in
[5]) +
156 ((limb
) ((s32
) in2
[3])) * ((s32
) in
[7]) +
157 ((limb
) ((s32
) in2
[7])) * ((s32
) in
[3]) +
158 ((limb
) ((s32
) in2
[1])) * ((s32
) in
[9]) +
159 ((limb
) ((s32
) in2
[9])) * ((s32
) in
[1])) +
160 ((limb
) ((s32
) in2
[4])) * ((s32
) in
[6]) +
161 ((limb
) ((s32
) in2
[6])) * ((s32
) in
[4]) +
162 ((limb
) ((s32
) in2
[2])) * ((s32
) in
[8]) +
163 ((limb
) ((s32
) in2
[8])) * ((s32
) in
[2]);
164 output
[11] = ((limb
) ((s32
) in2
[5])) * ((s32
) in
[6]) +
165 ((limb
) ((s32
) in2
[6])) * ((s32
) in
[5]) +
166 ((limb
) ((s32
) in2
[4])) * ((s32
) in
[7]) +
167 ((limb
) ((s32
) in2
[7])) * ((s32
) in
[4]) +
168 ((limb
) ((s32
) in2
[3])) * ((s32
) in
[8]) +
169 ((limb
) ((s32
) in2
[8])) * ((s32
) in
[3]) +
170 ((limb
) ((s32
) in2
[2])) * ((s32
) in
[9]) +
171 ((limb
) ((s32
) in2
[9])) * ((s32
) in
[2]);
172 output
[12] = ((limb
) ((s32
) in2
[6])) * ((s32
) in
[6]) +
173 2 * (((limb
) ((s32
) in2
[5])) * ((s32
) in
[7]) +
174 ((limb
) ((s32
) in2
[7])) * ((s32
) in
[5]) +
175 ((limb
) ((s32
) in2
[3])) * ((s32
) in
[9]) +
176 ((limb
) ((s32
) in2
[9])) * ((s32
) in
[3])) +
177 ((limb
) ((s32
) in2
[4])) * ((s32
) in
[8]) +
178 ((limb
) ((s32
) in2
[8])) * ((s32
) in
[4]);
179 output
[13] = ((limb
) ((s32
) in2
[6])) * ((s32
) in
[7]) +
180 ((limb
) ((s32
) in2
[7])) * ((s32
) in
[6]) +
181 ((limb
) ((s32
) in2
[5])) * ((s32
) in
[8]) +
182 ((limb
) ((s32
) in2
[8])) * ((s32
) in
[5]) +
183 ((limb
) ((s32
) in2
[4])) * ((s32
) in
[9]) +
184 ((limb
) ((s32
) in2
[9])) * ((s32
) in
[4]);
185 output
[14] = 2 * (((limb
) ((s32
) in2
[7])) * ((s32
) in
[7]) +
186 ((limb
) ((s32
) in2
[5])) * ((s32
) in
[9]) +
187 ((limb
) ((s32
) in2
[9])) * ((s32
) in
[5])) +
188 ((limb
) ((s32
) in2
[6])) * ((s32
) in
[8]) +
189 ((limb
) ((s32
) in2
[8])) * ((s32
) in
[6]);
190 output
[15] = ((limb
) ((s32
) in2
[7])) * ((s32
) in
[8]) +
191 ((limb
) ((s32
) in2
[8])) * ((s32
) in
[7]) +
192 ((limb
) ((s32
) in2
[6])) * ((s32
) in
[9]) +
193 ((limb
) ((s32
) in2
[9])) * ((s32
) in
[6]);
194 output
[16] = ((limb
) ((s32
) in2
[8])) * ((s32
) in
[8]) +
195 2 * (((limb
) ((s32
) in2
[7])) * ((s32
) in
[9]) +
196 ((limb
) ((s32
) in2
[9])) * ((s32
) in
[7]));
197 output
[17] = ((limb
) ((s32
) in2
[8])) * ((s32
) in
[9]) +
198 ((limb
) ((s32
) in2
[9])) * ((s32
) in
[8]);
199 output
[18] = 2 * ((limb
) ((s32
) in2
[9])) * ((s32
) in
[9]);
202 /* Reduce a long form to a short form by taking the input mod 2^255 - 19. */
203 static void freduce_degree(limb
*output
) {
204 /* Each of these shifts and adds ends up multiplying the value by 19. */
205 output
[8] += output
[18] << 4;
206 output
[8] += output
[18] << 1;
207 output
[8] += output
[18];
208 output
[7] += output
[17] << 4;
209 output
[7] += output
[17] << 1;
210 output
[7] += output
[17];
211 output
[6] += output
[16] << 4;
212 output
[6] += output
[16] << 1;
213 output
[6] += output
[16];
214 output
[5] += output
[15] << 4;
215 output
[5] += output
[15] << 1;
216 output
[5] += output
[15];
217 output
[4] += output
[14] << 4;
218 output
[4] += output
[14] << 1;
219 output
[4] += output
[14];
220 output
[3] += output
[13] << 4;
221 output
[3] += output
[13] << 1;
222 output
[3] += output
[13];
223 output
[2] += output
[12] << 4;
224 output
[2] += output
[12] << 1;
225 output
[2] += output
[12];
226 output
[1] += output
[11] << 4;
227 output
[1] += output
[11] << 1;
228 output
[1] += output
[11];
229 output
[0] += output
[10] << 4;
230 output
[0] += output
[10] << 1;
231 output
[0] += output
[10];
235 #error "This code only works on a two's complement system"
238 /* return v / 2^26, using only shifts and adds. */
240 div_by_2_26(const limb v
)
242 /* High word of v; no shift needed*/
243 const uint32_t highword
= (uint32_t) (((uint64_t) v
) >> 32);
244 /* Set to all 1s if v was negative; else set to 0s. */
245 const int32_t sign
= ((int32_t) highword
) >> 31;
246 /* Set to 0x3ffffff if v was negative; else set to 0. */
247 const int32_t roundoff
= ((uint32_t) sign
) >> 6;
248 /* Should return v / (1<<26) */
249 return (v
+ roundoff
) >> 26;
252 /* return v / (2^25), using only shifts and adds. */
254 div_by_2_25(const limb v
)
256 /* High word of v; no shift needed*/
257 const uint32_t highword
= (uint32_t) (((uint64_t) v
) >> 32);
258 /* Set to all 1s if v was negative; else set to 0s. */
259 const int32_t sign
= ((int32_t) highword
) >> 31;
260 /* Set to 0x1ffffff if v was negative; else set to 0. */
261 const int32_t roundoff
= ((uint32_t) sign
) >> 7;
262 /* Should return v / (1<<25) */
263 return (v
+ roundoff
) >> 25;
267 div_s32_by_2_25(const s32 v
)
269 const s32 roundoff
= ((uint32_t)(v
>> 31)) >> 7;
270 return (v
+ roundoff
) >> 25;
273 /* Reduce all coefficients of the short form input so that |x| < 2^26.
275 * On entry: |output[i]| < 2^62
277 static void freduce_coefficients(limb
*output
) {
282 for (i
= 0; i
< 10; i
+= 2) {
283 limb over
= div_by_2_26(output
[i
]);
284 output
[i
] -= over
<< 26;
287 over
= div_by_2_25(output
[i
+1]);
288 output
[i
+1] -= over
<< 25;
291 /* Now |output[10]| < 2 ^ 38 and all other coefficients are reduced. */
292 output
[0] += output
[10] << 4;
293 output
[0] += output
[10] << 1;
294 output
[0] += output
[10];
298 /* Now output[1..9] are reduced, and |output[0]| < 2^26 + 19 * 2^38
299 * So |over| will be no more than 77825 */
301 limb over
= div_by_2_26(output
[0]);
302 output
[0] -= over
<< 26;
306 /* Now output[0,2..9] are reduced, and |output[1]| < 2^25 + 77825
307 * So |over| will be no more than 1. */
309 /* output[1] fits in 32 bits, so we can use div_s32_by_2_25 here. */
310 s32 over32
= div_s32_by_2_25((s32
) output
[1]);
311 output
[1] -= over32
<< 25;
315 /* Finally, output[0,1,3..9] are reduced, and output[2] is "nearly reduced":
316 * we have |output[2]| <= 2^26. This is good enough for all of our math,
317 * but it will require an extra freduce_coefficients before fcontract. */
320 /* A helpful wrapper around fproduct: output = in * in2.
322 * output must be distinct to both inputs. The output is reduced degree and
323 * reduced coefficient.
326 fmul(limb
*output
, const limb
*in
, const limb
*in2
) {
328 fproduct(t
, in
, in2
);
330 freduce_coefficients(t
);
331 memcpy(output
, t
, sizeof(limb
) * 10);
334 static void fsquare_inner(limb
*output
, const limb
*in
) {
335 output
[0] = ((limb
) ((s32
) in
[0])) * ((s32
) in
[0]);
336 output
[1] = 2 * ((limb
) ((s32
) in
[0])) * ((s32
) in
[1]);
337 output
[2] = 2 * (((limb
) ((s32
) in
[1])) * ((s32
) in
[1]) +
338 ((limb
) ((s32
) in
[0])) * ((s32
) in
[2]));
339 output
[3] = 2 * (((limb
) ((s32
) in
[1])) * ((s32
) in
[2]) +
340 ((limb
) ((s32
) in
[0])) * ((s32
) in
[3]));
341 output
[4] = ((limb
) ((s32
) in
[2])) * ((s32
) in
[2]) +
342 4 * ((limb
) ((s32
) in
[1])) * ((s32
) in
[3]) +
343 2 * ((limb
) ((s32
) in
[0])) * ((s32
) in
[4]);
344 output
[5] = 2 * (((limb
) ((s32
) in
[2])) * ((s32
) in
[3]) +
345 ((limb
) ((s32
) in
[1])) * ((s32
) in
[4]) +
346 ((limb
) ((s32
) in
[0])) * ((s32
) in
[5]));
347 output
[6] = 2 * (((limb
) ((s32
) in
[3])) * ((s32
) in
[3]) +
348 ((limb
) ((s32
) in
[2])) * ((s32
) in
[4]) +
349 ((limb
) ((s32
) in
[0])) * ((s32
) in
[6]) +
350 2 * ((limb
) ((s32
) in
[1])) * ((s32
) in
[5]));
351 output
[7] = 2 * (((limb
) ((s32
) in
[3])) * ((s32
) in
[4]) +
352 ((limb
) ((s32
) in
[2])) * ((s32
) in
[5]) +
353 ((limb
) ((s32
) in
[1])) * ((s32
) in
[6]) +
354 ((limb
) ((s32
) in
[0])) * ((s32
) in
[7]));
355 output
[8] = ((limb
) ((s32
) in
[4])) * ((s32
) in
[4]) +
356 2 * (((limb
) ((s32
) in
[2])) * ((s32
) in
[6]) +
357 ((limb
) ((s32
) in
[0])) * ((s32
) in
[8]) +
358 2 * (((limb
) ((s32
) in
[1])) * ((s32
) in
[7]) +
359 ((limb
) ((s32
) in
[3])) * ((s32
) in
[5])));
360 output
[9] = 2 * (((limb
) ((s32
) in
[4])) * ((s32
) in
[5]) +
361 ((limb
) ((s32
) in
[3])) * ((s32
) in
[6]) +
362 ((limb
) ((s32
) in
[2])) * ((s32
) in
[7]) +
363 ((limb
) ((s32
) in
[1])) * ((s32
) in
[8]) +
364 ((limb
) ((s32
) in
[0])) * ((s32
) in
[9]));
365 output
[10] = 2 * (((limb
) ((s32
) in
[5])) * ((s32
) in
[5]) +
366 ((limb
) ((s32
) in
[4])) * ((s32
) in
[6]) +
367 ((limb
) ((s32
) in
[2])) * ((s32
) in
[8]) +
368 2 * (((limb
) ((s32
) in
[3])) * ((s32
) in
[7]) +
369 ((limb
) ((s32
) in
[1])) * ((s32
) in
[9])));
370 output
[11] = 2 * (((limb
) ((s32
) in
[5])) * ((s32
) in
[6]) +
371 ((limb
) ((s32
) in
[4])) * ((s32
) in
[7]) +
372 ((limb
) ((s32
) in
[3])) * ((s32
) in
[8]) +
373 ((limb
) ((s32
) in
[2])) * ((s32
) in
[9]));
374 output
[12] = ((limb
) ((s32
) in
[6])) * ((s32
) in
[6]) +
375 2 * (((limb
) ((s32
) in
[4])) * ((s32
) in
[8]) +
376 2 * (((limb
) ((s32
) in
[5])) * ((s32
) in
[7]) +
377 ((limb
) ((s32
) in
[3])) * ((s32
) in
[9])));
378 output
[13] = 2 * (((limb
) ((s32
) in
[6])) * ((s32
) in
[7]) +
379 ((limb
) ((s32
) in
[5])) * ((s32
) in
[8]) +
380 ((limb
) ((s32
) in
[4])) * ((s32
) in
[9]));
381 output
[14] = 2 * (((limb
) ((s32
) in
[7])) * ((s32
) in
[7]) +
382 ((limb
) ((s32
) in
[6])) * ((s32
) in
[8]) +
383 2 * ((limb
) ((s32
) in
[5])) * ((s32
) in
[9]));
384 output
[15] = 2 * (((limb
) ((s32
) in
[7])) * ((s32
) in
[8]) +
385 ((limb
) ((s32
) in
[6])) * ((s32
) in
[9]));
386 output
[16] = ((limb
) ((s32
) in
[8])) * ((s32
) in
[8]) +
387 4 * ((limb
) ((s32
) in
[7])) * ((s32
) in
[9]);
388 output
[17] = 2 * ((limb
) ((s32
) in
[8])) * ((s32
) in
[9]);
389 output
[18] = 2 * ((limb
) ((s32
) in
[9])) * ((s32
) in
[9]);
393 fsquare(limb
*output
, const limb
*in
) {
395 fsquare_inner(t
, in
);
397 freduce_coefficients(t
);
398 memcpy(output
, t
, sizeof(limb
) * 10);
401 /* Take a little-endian, 32-byte number and expand it into polynomial form */
403 fexpand(limb
*output
, const u8
*input
) {
404 #define F(n,start,shift,mask) \
405 output[n] = ((((limb) input[start + 0]) | \
406 ((limb) input[start + 1]) << 8 | \
407 ((limb) input[start + 2]) << 16 | \
408 ((limb) input[start + 3]) << 24) >> shift) & mask;
409 F(0, 0, 0, 0x3ffffff);
410 F(1, 3, 2, 0x1ffffff);
411 F(2, 6, 3, 0x3ffffff);
412 F(3, 9, 5, 0x1ffffff);
413 F(4, 12, 6, 0x3ffffff);
414 F(5, 16, 0, 0x1ffffff);
415 F(6, 19, 1, 0x3ffffff);
416 F(7, 22, 3, 0x1ffffff);
417 F(8, 25, 4, 0x3ffffff);
418 F(9, 28, 6, 0x1ffffff);
422 #if (-32 >> 1) != -16
423 #error "This code only works when >> does sign-extension on negative numbers"
426 /* Take a fully reduced polynomial form number and contract it into a
427 * little-endian, 32-byte array
430 fcontract(u8
*output
, limb
*input
) {
434 for (j
= 0; j
< 2; ++j
) {
435 for (i
= 0; i
< 9; ++i
) {
437 /* This calculation is a time-invariant way to make input[i] positive
438 by borrowing from the next-larger limb.
440 const s32 mask
= (s32
)(input
[i
]) >> 31;
441 const s32 carry
= -(((s32
)(input
[i
]) & mask
) >> 25);
442 input
[i
] = (s32
)(input
[i
]) + (carry
<< 25);
443 input
[i
+1] = (s32
)(input
[i
+1]) - carry
;
445 const s32 mask
= (s32
)(input
[i
]) >> 31;
446 const s32 carry
= -(((s32
)(input
[i
]) & mask
) >> 26);
447 input
[i
] = (s32
)(input
[i
]) + (carry
<< 26);
448 input
[i
+1] = (s32
)(input
[i
+1]) - carry
;
452 const s32 mask
= (s32
)(input
[9]) >> 31;
453 const s32 carry
= -(((s32
)(input
[9]) & mask
) >> 25);
454 input
[9] = (s32
)(input
[9]) + (carry
<< 25);
455 input
[0] = (s32
)(input
[0]) - (carry
* 19);
459 /* The first borrow-propagation pass above ended with every limb
460 except (possibly) input[0] non-negative.
462 Since each input limb except input[0] is decreased by at most 1
463 by a borrow-propagation pass, the second borrow-propagation pass
464 could only have wrapped around to decrease input[0] again if the
465 first pass left input[0] negative *and* input[1] through input[9]
466 were all zero. In that case, input[1] is now 2^25 - 1, and this
467 last borrow-propagation step will leave input[1] non-negative.
470 const s32 mask
= (s32
)(input
[0]) >> 31;
471 const s32 carry
= -(((s32
)(input
[0]) & mask
) >> 26);
472 input
[0] = (s32
)(input
[0]) + (carry
<< 26);
473 input
[1] = (s32
)(input
[1]) - carry
;
476 /* Both passes through the above loop, plus the last 0-to-1 step, are
477 necessary: if input[9] is -1 and input[0] through input[8] are 0,
478 negative values will remain in the array until the end.
490 output[s+0] |= input[i] & 0xff; \
491 output[s+1] = (input[i] >> 8) & 0xff; \
492 output[s+2] = (input[i] >> 16) & 0xff; \
493 output[s+3] = (input[i] >> 24) & 0xff;
509 /* Input: Q, Q', Q-Q'
514 * x z: short form, destroyed
515 * xprime zprime: short form, destroyed
516 * qmqp: short form, preserved
518 static void fmonty(limb
*x2
, limb
*z2
, /* output 2Q */
519 limb
*x3
, limb
*z3
, /* output Q + Q' */
520 limb
*x
, limb
*z
, /* input Q */
521 limb
*xprime
, limb
*zprime
, /* input Q' */
522 const limb
*qmqp
/* input Q - Q' */) {
523 limb origx
[10], origxprime
[10], zzz
[19], xx
[19], zz
[19], xxprime
[19],
524 zzprime
[19], zzzprime
[19], xxxprime
[19];
526 memcpy(origx
, x
, 10 * sizeof(limb
));
528 fdifference(z
, origx
); // does x - z
530 memcpy(origxprime
, xprime
, sizeof(limb
) * 10);
531 fsum(xprime
, zprime
);
532 fdifference(zprime
, origxprime
);
533 fproduct(xxprime
, xprime
, z
);
534 fproduct(zzprime
, x
, zprime
);
535 freduce_degree(xxprime
);
536 freduce_coefficients(xxprime
);
537 freduce_degree(zzprime
);
538 freduce_coefficients(zzprime
);
539 memcpy(origxprime
, xxprime
, sizeof(limb
) * 10);
540 fsum(xxprime
, zzprime
);
541 fdifference(zzprime
, origxprime
);
542 fsquare(xxxprime
, xxprime
);
543 fsquare(zzzprime
, zzprime
);
544 fproduct(zzprime
, zzzprime
, qmqp
);
545 freduce_degree(zzprime
);
546 freduce_coefficients(zzprime
);
547 memcpy(x3
, xxxprime
, sizeof(limb
) * 10);
548 memcpy(z3
, zzprime
, sizeof(limb
) * 10);
552 fproduct(x2
, xx
, zz
);
554 freduce_coefficients(x2
);
555 fdifference(zz
, xx
); // does zz = xx - zz
556 memset(zzz
+ 10, 0, sizeof(limb
) * 9);
557 fscalar_product(zzz
, zz
, 121665);
558 /* No need to call freduce_degree here:
559 fscalar_product doesn't increase the degree of its input. */
560 freduce_coefficients(zzz
);
562 fproduct(z2
, zz
, zzz
);
564 freduce_coefficients(z2
);
567 /* Conditionally swap two reduced-form limb arrays if 'iswap' is 1, but leave
568 * them unchanged if 'iswap' is 0. Runs in data-invariant time to avoid
569 * side-channel attacks.
571 * NOTE that this function requires that 'iswap' be 1 or 0; other values give
572 * wrong results. Also, the two limb arrays must be in reduced-coefficient,
573 * reduced-degree form: the values in a[10..19] or b[10..19] aren't swapped,
574 * and all all values in a[0..9],b[0..9] must have magnitude less than
578 swap_conditional(limb a
[19], limb b
[19], limb iswap
) {
580 const s32 swap
= (s32
) -iswap
;
582 for (i
= 0; i
< 10; ++i
) {
583 const s32 x
= swap
& ( ((s32
)a
[i
]) ^ ((s32
)b
[i
]) );
584 a
[i
] = ((s32
)a
[i
]) ^ x
;
585 b
[i
] = ((s32
)b
[i
]) ^ x
;
589 /* Calculates nQ where Q is the x-coordinate of a point on the curve
591 * resultx/resultz: the x coordinate of the resulting curve point (short form)
592 * n: a little endian, 32-byte number
593 * q: a point of the curve (short form)
596 cmult(limb
*resultx
, limb
*resultz
, const u8
*n
, const limb
*q
) {
597 limb a
[19] = {0}, b
[19] = {1}, c
[19] = {1}, d
[19] = {0};
598 limb
*nqpqx
= a
, *nqpqz
= b
, *nqx
= c
, *nqz
= d
, *t
;
599 limb e
[19] = {0}, f
[19] = {1}, g
[19] = {0}, h
[19] = {1};
600 limb
*nqpqx2
= e
, *nqpqz2
= f
, *nqx2
= g
, *nqz2
= h
;
604 memcpy(nqpqx
, q
, sizeof(limb
) * 10);
606 for (i
= 0; i
< 32; ++i
) {
608 for (j
= 0; j
< 8; ++j
) {
609 const limb bit
= byte
>> 7;
611 swap_conditional(nqx
, nqpqx
, bit
);
612 swap_conditional(nqz
, nqpqz
, bit
);
618 swap_conditional(nqx2
, nqpqx2
, bit
);
619 swap_conditional(nqz2
, nqpqz2
, bit
);
638 memcpy(resultx
, nqx
, sizeof(limb
) * 10);
639 memcpy(resultz
, nqz
, sizeof(limb
) * 10);
642 // -----------------------------------------------------------------------------
643 // Shamelessly copied from djb's code
644 // -----------------------------------------------------------------------------
646 crecip(limb
*out
, const limb
*z
) {
659 /* 2 */ fsquare(z2
,z
);
660 /* 4 */ fsquare(t1
,z2
);
661 /* 8 */ fsquare(t0
,t1
);
662 /* 9 */ fmul(z9
,t0
,z
);
663 /* 11 */ fmul(z11
,z9
,z2
);
664 /* 22 */ fsquare(t0
,z11
);
665 /* 2^5 - 2^0 = 31 */ fmul(z2_5_0
,t0
,z9
);
667 /* 2^6 - 2^1 */ fsquare(t0
,z2_5_0
);
668 /* 2^7 - 2^2 */ fsquare(t1
,t0
);
669 /* 2^8 - 2^3 */ fsquare(t0
,t1
);
670 /* 2^9 - 2^4 */ fsquare(t1
,t0
);
671 /* 2^10 - 2^5 */ fsquare(t0
,t1
);
672 /* 2^10 - 2^0 */ fmul(z2_10_0
,t0
,z2_5_0
);
674 /* 2^11 - 2^1 */ fsquare(t0
,z2_10_0
);
675 /* 2^12 - 2^2 */ fsquare(t1
,t0
);
676 /* 2^20 - 2^10 */ for (i
= 2;i
< 10;i
+= 2) { fsquare(t0
,t1
); fsquare(t1
,t0
); }
677 /* 2^20 - 2^0 */ fmul(z2_20_0
,t1
,z2_10_0
);
679 /* 2^21 - 2^1 */ fsquare(t0
,z2_20_0
);
680 /* 2^22 - 2^2 */ fsquare(t1
,t0
);
681 /* 2^40 - 2^20 */ for (i
= 2;i
< 20;i
+= 2) { fsquare(t0
,t1
); fsquare(t1
,t0
); }
682 /* 2^40 - 2^0 */ fmul(t0
,t1
,z2_20_0
);
684 /* 2^41 - 2^1 */ fsquare(t1
,t0
);
685 /* 2^42 - 2^2 */ fsquare(t0
,t1
);
686 /* 2^50 - 2^10 */ for (i
= 2;i
< 10;i
+= 2) { fsquare(t1
,t0
); fsquare(t0
,t1
); }
687 /* 2^50 - 2^0 */ fmul(z2_50_0
,t0
,z2_10_0
);
689 /* 2^51 - 2^1 */ fsquare(t0
,z2_50_0
);
690 /* 2^52 - 2^2 */ fsquare(t1
,t0
);
691 /* 2^100 - 2^50 */ for (i
= 2;i
< 50;i
+= 2) { fsquare(t0
,t1
); fsquare(t1
,t0
); }
692 /* 2^100 - 2^0 */ fmul(z2_100_0
,t1
,z2_50_0
);
694 /* 2^101 - 2^1 */ fsquare(t1
,z2_100_0
);
695 /* 2^102 - 2^2 */ fsquare(t0
,t1
);
696 /* 2^200 - 2^100 */ for (i
= 2;i
< 100;i
+= 2) { fsquare(t1
,t0
); fsquare(t0
,t1
); }
697 /* 2^200 - 2^0 */ fmul(t1
,t0
,z2_100_0
);
699 /* 2^201 - 2^1 */ fsquare(t0
,t1
);
700 /* 2^202 - 2^2 */ fsquare(t1
,t0
);
701 /* 2^250 - 2^50 */ for (i
= 2;i
< 50;i
+= 2) { fsquare(t0
,t1
); fsquare(t1
,t0
); }
702 /* 2^250 - 2^0 */ fmul(t0
,t1
,z2_50_0
);
704 /* 2^251 - 2^1 */ fsquare(t1
,t0
);
705 /* 2^252 - 2^2 */ fsquare(t0
,t1
);
706 /* 2^253 - 2^3 */ fsquare(t1
,t0
);
707 /* 2^254 - 2^4 */ fsquare(t0
,t1
);
708 /* 2^255 - 2^5 */ fsquare(t1
,t0
);
709 /* 2^255 - 21 */ fmul(out
,t1
,z11
);
712 int curve25519_donna(u8
*, const u8
*, const u8
*);
715 curve25519_donna(u8
*mypublic
, const u8
*secret
, const u8
*basepoint
) {
716 limb bp
[10], x
[10], z
[11], zmone
[10];
720 for (i
= 0; i
< 32; ++i
) e
[i
] = secret
[i
];
725 fexpand(bp
, basepoint
);
729 freduce_coefficients(z
);
730 fcontract(mypublic
, z
);