1 /*********************************************************************
2 * Copyright (c) 2016 Pieter Wuille *
3 * Distributed under the MIT software license, see the accompanying *
4 * file COPYING or http://www.opensource.org/licenses/mit-license.php.*
5 **********************************************************************/
7 /* Constant time, unoptimized, concise, plain C, AES implementation
9 * Emilia Kasper and Peter Schwabe, Faster and Timing-Attack Resistant AES-GCM
10 * http://www.iacr.org/archive/ches2009/57470001/57470001.pdf
11 * But using 8 16-bit integers representing a single AES state rather than 8 128-bit
12 * integers representing 8 AES states.
17 /* Slice variable slice_i contains the i'th bit of the 16 state variables in this order:
24 /** Convert a byte to sliced form, storing it corresponding to given row and column in s */
25 static void LoadByte(AES_state
* s
, unsigned char byte
, int r
, int c
) {
27 for (i
= 0; i
< 8; i
++) {
28 s
->slice
[i
] |= (byte
& 1) << (r
* 4 + c
);
33 /** Load 16 bytes of data into 8 sliced integers */
34 static void LoadBytes(AES_state
*s
, const unsigned char* data16
) {
36 for (c
= 0; c
< 4; c
++) {
38 for (r
= 0; r
< 4; r
++) {
39 LoadByte(s
, *(data16
++), r
, c
);
44 /** Convert 8 sliced integers into 16 bytes of data */
45 static void SaveBytes(unsigned char* data16
, const AES_state
*s
) {
47 for (c
= 0; c
< 4; c
++) {
49 for (r
= 0; r
< 4; r
++) {
52 for (b
= 0; b
< 8; b
++) {
53 v
|= ((s
->slice
[b
] >> (r
* 4 + c
)) & 1) << b
;
60 /* S-box implementation based on the gate logic from:
61 * Joan Boyar and Rene Peralta, A depth-16 circuit for the AES S-box.
62 * https://eprint.iacr.org/2011/332.pdf
64 static void SubBytes(AES_state
*s
, int inv
) {
65 /* Load the bit slices */
66 uint16_t U0
= s
->slice
[7], U1
= s
->slice
[6], U2
= s
->slice
[5], U3
= s
->slice
[4];
67 uint16_t U4
= s
->slice
[3], U5
= s
->slice
[2], U6
= s
->slice
[1], U7
= s
->slice
[0];
69 uint16_t T1
, T2
, T3
, T4
, T5
, T6
, T7
, T8
, T9
, T10
, T11
, T12
, T13
, T14
, T15
, T16
;
70 uint16_t T17
, T18
, T19
, T20
, T21
, T22
, T23
, T24
, T25
, T26
, T27
, D
;
71 uint16_t M1
, M6
, M11
, M13
, M15
, M20
, M21
, M22
, M23
, M25
, M37
, M38
, M39
, M40
;
72 uint16_t M41
, M42
, M43
, M44
, M45
, M46
, M47
, M48
, M49
, M50
, M51
, M52
, M53
, M54
;
73 uint16_t M55
, M56
, M57
, M58
, M59
, M60
, M61
, M62
, M63
;
76 uint16_t R5
, R13
, R17
, R18
, R19
;
77 /* Undo linear postprocessing */
106 /* Linear preprocessing. */
137 /* Non-linear transformation (shared between the forward and backward case) */
141 M13
= (T4
& T27
) ^ M11
;
142 M15
= (T2
& T10
) ^ M11
;
143 M20
= T14
^ M1
^ (T23
& T8
) ^ M13
;
144 M21
= (T19
& D
) ^ M1
^ T24
^ M15
;
145 M22
= T26
^ M6
^ (T22
& T9
) ^ M13
;
146 M23
= (T20
& T17
) ^ M6
^ M15
^ T25
;
148 M37
= M21
^ ((M20
^ M21
) & (M23
^ M25
));
149 M38
= M20
^ M25
^ (M21
| (M20
& M23
));
150 M39
= M23
^ ((M22
^ M23
) & (M21
^ M25
));
151 M40
= M22
^ M25
^ (M23
| (M21
& M22
));
177 /* Undo linear preprocessing */
178 uint16_t P0
= M52
^ M61
;
179 uint16_t P1
= M58
^ M59
;
180 uint16_t P2
= M54
^ M62
;
181 uint16_t P3
= M47
^ M50
;
182 uint16_t P4
= M48
^ M56
;
183 uint16_t P5
= M46
^ M51
;
184 uint16_t P6
= M49
^ M60
;
185 uint16_t P7
= P0
^ P1
;
186 uint16_t P8
= M50
^ M53
;
187 uint16_t P9
= M55
^ M63
;
188 uint16_t P10
= M57
^ P4
;
189 uint16_t P11
= P0
^ P3
;
190 uint16_t P12
= M46
^ M48
;
191 uint16_t P13
= M49
^ M51
;
192 uint16_t P14
= M49
^ M62
;
193 uint16_t P15
= M54
^ M59
;
194 uint16_t P16
= M57
^ M61
;
195 uint16_t P17
= M58
^ P2
;
196 uint16_t P18
= M63
^ P5
;
197 uint16_t P19
= P2
^ P3
;
198 uint16_t P20
= P4
^ P6
;
199 uint16_t P22
= P2
^ P7
;
200 uint16_t P23
= P7
^ P8
;
201 uint16_t P24
= P5
^ P7
;
202 uint16_t P25
= P6
^ P10
;
203 uint16_t P26
= P9
^ P11
;
204 uint16_t P27
= P10
^ P18
;
205 uint16_t P28
= P11
^ P25
;
206 uint16_t P29
= P15
^ P20
;
207 s
->slice
[7] = P13
^ P22
;
208 s
->slice
[6] = P26
^ P29
;
209 s
->slice
[5] = P17
^ P28
;
210 s
->slice
[4] = P12
^ P22
;
211 s
->slice
[3] = P23
^ P27
;
212 s
->slice
[2] = P19
^ P24
;
213 s
->slice
[1] = P14
^ P23
;
214 s
->slice
[0] = P9
^ P16
;
216 /* Linear postprocessing */
217 uint16_t L0
= M61
^ M62
;
218 uint16_t L1
= M50
^ M56
;
219 uint16_t L2
= M46
^ M48
;
220 uint16_t L3
= M47
^ M55
;
221 uint16_t L4
= M54
^ M58
;
222 uint16_t L5
= M49
^ M61
;
223 uint16_t L6
= M62
^ L5
;
224 uint16_t L7
= M46
^ L3
;
225 uint16_t L8
= M51
^ M59
;
226 uint16_t L9
= M52
^ M53
;
227 uint16_t L10
= M53
^ L4
;
228 uint16_t L11
= M60
^ L2
;
229 uint16_t L12
= M48
^ M51
;
230 uint16_t L13
= M50
^ L0
;
231 uint16_t L14
= M52
^ M61
;
232 uint16_t L15
= M55
^ L1
;
233 uint16_t L16
= M56
^ L0
;
234 uint16_t L17
= M57
^ L1
;
235 uint16_t L18
= M58
^ L8
;
236 uint16_t L19
= M63
^ L4
;
237 uint16_t L20
= L0
^ L1
;
238 uint16_t L21
= L1
^ L7
;
239 uint16_t L22
= L3
^ L12
;
240 uint16_t L23
= L18
^ L2
;
241 uint16_t L24
= L15
^ L9
;
242 uint16_t L25
= L6
^ L10
;
243 uint16_t L26
= L7
^ L9
;
244 uint16_t L27
= L8
^ L10
;
245 uint16_t L28
= L11
^ L14
;
246 uint16_t L29
= L11
^ L17
;
247 s
->slice
[7] = L6
^ L24
;
248 s
->slice
[6] = ~(L16
^ L26
);
249 s
->slice
[5] = ~(L19
^ L28
);
250 s
->slice
[4] = L6
^ L21
;
251 s
->slice
[3] = L20
^ L22
;
252 s
->slice
[2] = L25
^ L29
;
253 s
->slice
[1] = ~(L13
^ L27
);
254 s
->slice
[0] = ~(L6
^ L23
);
258 #define BIT_RANGE(from,to) (((1 << ((to) - (from))) - 1) << (from))
260 #define BIT_RANGE_LEFT(x,from,to,shift) (((x) & BIT_RANGE((from), (to))) << (shift))
261 #define BIT_RANGE_RIGHT(x,from,to,shift) (((x) & BIT_RANGE((from), (to))) >> (shift))
263 static void ShiftRows(AES_state
* s
) {
265 for (i
= 0; i
< 8; i
++) {
266 uint16_t v
= s
->slice
[i
];
268 (v
& BIT_RANGE(0, 4)) |
269 BIT_RANGE_LEFT(v
, 4, 5, 3) | BIT_RANGE_RIGHT(v
, 5, 8, 1) |
270 BIT_RANGE_LEFT(v
, 8, 10, 2) | BIT_RANGE_RIGHT(v
, 10, 12, 2) |
271 BIT_RANGE_LEFT(v
, 12, 15, 1) | BIT_RANGE_RIGHT(v
, 15, 16, 3);
275 static void InvShiftRows(AES_state
* s
) {
277 for (i
= 0; i
< 8; i
++) {
278 uint16_t v
= s
->slice
[i
];
280 (v
& BIT_RANGE(0, 4)) |
281 BIT_RANGE_LEFT(v
, 4, 7, 1) | BIT_RANGE_RIGHT(v
, 7, 8, 3) |
282 BIT_RANGE_LEFT(v
, 8, 10, 2) | BIT_RANGE_RIGHT(v
, 10, 12, 2) |
283 BIT_RANGE_LEFT(v
, 12, 13, 3) | BIT_RANGE_RIGHT(v
, 13, 16, 1);
287 #define ROT(x,b) (((x) >> ((b) * 4)) | ((x) << ((4-(b)) * 4)))
289 static void MixColumns(AES_state
* s
, int inv
) {
290 /* The MixColumns transform treats the bytes of the columns of the state as
291 * coefficients of a 3rd degree polynomial over GF(2^8) and multiplies them
292 * by the fixed polynomial a(x) = {03}x^3 + {01}x^2 + {01}x + {02}, modulo
295 * In the inverse transform, we multiply by the inverse of a(x),
296 * a^-1(x) = {0b}x^3 + {0d}x^2 + {09}x + {0e}. This is equal to
297 * a(x) * ({04}x^2 + {05}), so we can reuse the forward transform's code
298 * (found in OpenSSL's bsaes-x86_64.pl, attributed to Jussi Kivilinna)
300 * In the bitsliced representation, a multiplication of every column by x
301 * mod x^4 + 1 is simply a right rotation.
304 /* Shared for both directions is a multiplication by a(x), which can be
305 * rewritten as (x^3 + x^2 + x) + {02}*(x^3 + {01}).
307 * First compute s into the s? variables, (x^3 + {01}) * s into the s?_01
308 * variables and (x^3 + x^2 + x)*s into the s?_123 variables.
310 uint16_t s0
= s
->slice
[0], s1
= s
->slice
[1], s2
= s
->slice
[2], s3
= s
->slice
[3];
311 uint16_t s4
= s
->slice
[4], s5
= s
->slice
[5], s6
= s
->slice
[6], s7
= s
->slice
[7];
312 uint16_t s0_01
= s0
^ ROT(s0
, 1), s0_123
= ROT(s0_01
, 1) ^ ROT(s0
, 3);
313 uint16_t s1_01
= s1
^ ROT(s1
, 1), s1_123
= ROT(s1_01
, 1) ^ ROT(s1
, 3);
314 uint16_t s2_01
= s2
^ ROT(s2
, 1), s2_123
= ROT(s2_01
, 1) ^ ROT(s2
, 3);
315 uint16_t s3_01
= s3
^ ROT(s3
, 1), s3_123
= ROT(s3_01
, 1) ^ ROT(s3
, 3);
316 uint16_t s4_01
= s4
^ ROT(s4
, 1), s4_123
= ROT(s4_01
, 1) ^ ROT(s4
, 3);
317 uint16_t s5_01
= s5
^ ROT(s5
, 1), s5_123
= ROT(s5_01
, 1) ^ ROT(s5
, 3);
318 uint16_t s6_01
= s6
^ ROT(s6
, 1), s6_123
= ROT(s6_01
, 1) ^ ROT(s6
, 3);
319 uint16_t s7_01
= s7
^ ROT(s7
, 1), s7_123
= ROT(s7_01
, 1) ^ ROT(s7
, 3);
320 /* Now compute s = s?_123 + {02} * s?_01. */
321 s
->slice
[0] = s7_01
^ s0_123
;
322 s
->slice
[1] = s7_01
^ s0_01
^ s1_123
;
323 s
->slice
[2] = s1_01
^ s2_123
;
324 s
->slice
[3] = s7_01
^ s2_01
^ s3_123
;
325 s
->slice
[4] = s7_01
^ s3_01
^ s4_123
;
326 s
->slice
[5] = s4_01
^ s5_123
;
327 s
->slice
[6] = s5_01
^ s6_123
;
328 s
->slice
[7] = s6_01
^ s7_123
;
330 /* In the reverse direction, we further need to multiply by
331 * {04}x^2 + {05}, which can be written as {04} * (x^2 + {01}) + {01}.
333 * First compute (x^2 + {01}) * s into the t?_02 variables: */
334 uint16_t t0_02
= s
->slice
[0] ^ ROT(s
->slice
[0], 2);
335 uint16_t t1_02
= s
->slice
[1] ^ ROT(s
->slice
[1], 2);
336 uint16_t t2_02
= s
->slice
[2] ^ ROT(s
->slice
[2], 2);
337 uint16_t t3_02
= s
->slice
[3] ^ ROT(s
->slice
[3], 2);
338 uint16_t t4_02
= s
->slice
[4] ^ ROT(s
->slice
[4], 2);
339 uint16_t t5_02
= s
->slice
[5] ^ ROT(s
->slice
[5], 2);
340 uint16_t t6_02
= s
->slice
[6] ^ ROT(s
->slice
[6], 2);
341 uint16_t t7_02
= s
->slice
[7] ^ ROT(s
->slice
[7], 2);
342 /* And then update s += {04} * t?_02 */
343 s
->slice
[0] ^= t6_02
;
344 s
->slice
[1] ^= t6_02
^ t7_02
;
345 s
->slice
[2] ^= t0_02
^ t7_02
;
346 s
->slice
[3] ^= t1_02
^ t6_02
;
347 s
->slice
[4] ^= t2_02
^ t6_02
^ t7_02
;
348 s
->slice
[5] ^= t3_02
^ t7_02
;
349 s
->slice
[6] ^= t4_02
;
350 s
->slice
[7] ^= t5_02
;
354 static void AddRoundKey(AES_state
* s
, const AES_state
* round
) {
356 for (b
= 0; b
< 8; b
++) {
357 s
->slice
[b
] ^= round
->slice
[b
];
361 /** column_0(s) = column_c(a) */
362 static void GetOneColumn(AES_state
* s
, const AES_state
* a
, int c
) {
364 for (b
= 0; b
< 8; b
++) {
365 s
->slice
[b
] = (a
->slice
[b
] >> c
) & 0x1111;
369 /** column_c1(r) |= (column_0(s) ^= column_c2(a)) */
370 static void KeySetupColumnMix(AES_state
* s
, AES_state
* r
, const AES_state
* a
, int c1
, int c2
) {
372 for (b
= 0; b
< 8; b
++) {
373 r
->slice
[b
] |= ((s
->slice
[b
] ^= ((a
->slice
[b
] >> c2
) & 0x1111)) & 0x1111) << c1
;
377 /** Rotate the rows in s one position upwards, and xor in r */
378 static void KeySetupTransform(AES_state
* s
, const AES_state
* r
) {
380 for (b
= 0; b
< 8; b
++) {
381 s
->slice
[b
] = ((s
->slice
[b
] >> 4) | (s
->slice
[b
] << 12)) ^ r
->slice
[b
];
385 /* Multiply the cells in s by x, as polynomials over GF(2) mod x^8 + x^4 + x^3 + x + 1 */
386 static void MultX(AES_state
* s
) {
387 uint16_t top
= s
->slice
[7];
388 s
->slice
[7] = s
->slice
[6];
389 s
->slice
[6] = s
->slice
[5];
390 s
->slice
[5] = s
->slice
[4];
391 s
->slice
[4] = s
->slice
[3] ^ top
;
392 s
->slice
[3] = s
->slice
[2] ^ top
;
393 s
->slice
[2] = s
->slice
[1];
394 s
->slice
[1] = s
->slice
[0] ^ top
;
398 /** Expand the cipher key into the key schedule.
400 * state must be a pointer to an array of size nrounds + 1.
401 * key must be a pointer to 4 * nkeywords bytes.
403 * AES128 uses nkeywords = 4, nrounds = 10
404 * AES192 uses nkeywords = 6, nrounds = 12
405 * AES256 uses nkeywords = 8, nrounds = 14
407 static void AES_setup(AES_state
* rounds
, const uint8_t* key
, int nkeywords
, int nrounds
)
411 /* The one-byte round constant */
412 AES_state rcon
= {{1,0,0,0,0,0,0,0}};
413 /* The number of the word being generated, modulo nkeywords */
415 /* The column representing the word currently being processed */
418 for (i
= 0; i
< nrounds
+ 1; i
++) {
420 for (b
= 0; b
< 8; b
++) {
421 rounds
[i
].slice
[b
] = 0;
425 /* The first nkeywords round columns are just taken from the key directly. */
426 for (i
= 0; i
< nkeywords
; i
++) {
428 for (r
= 0; r
< 4; r
++) {
429 LoadByte(&rounds
[i
>> 2], *(key
++), r
, i
& 3);
433 GetOneColumn(&column
, &rounds
[(nkeywords
- 1) >> 2], (nkeywords
- 1) & 3);
435 for (i
= nkeywords
; i
< 4 * (nrounds
+ 1); i
++) {
436 /* Transform column */
438 SubBytes(&column
, 0);
439 KeySetupTransform(&column
, &rcon
);
441 } else if (nkeywords
> 6 && pos
== 4) {
442 SubBytes(&column
, 0);
444 if (++pos
== nkeywords
) pos
= 0;
445 KeySetupColumnMix(&column
, &rounds
[i
>> 2], &rounds
[(i
- nkeywords
) >> 2], i
& 3, (i
- nkeywords
) & 3);
449 static void AES_encrypt(const AES_state
* rounds
, int nrounds
, unsigned char* cipher16
, const unsigned char* plain16
) {
453 LoadBytes(&s
, plain16
);
454 AddRoundKey(&s
, rounds
++);
456 for (round
= 1; round
< nrounds
; round
++) {
460 AddRoundKey(&s
, rounds
++);
465 AddRoundKey(&s
, rounds
);
467 SaveBytes(cipher16
, &s
);
470 static void AES_decrypt(const AES_state
* rounds
, int nrounds
, unsigned char* plain16
, const unsigned char* cipher16
) {
471 /* Most AES decryption implementations use the alternate scheme
472 * (the Equivalent Inverse Cipher), which allows for more code reuse between
473 * the encryption and decryption code, but requires separate setup for both.
480 LoadBytes(&s
, cipher16
);
481 AddRoundKey(&s
, rounds
--);
483 for (round
= 1; round
< nrounds
; round
++) {
486 AddRoundKey(&s
, rounds
--);
492 AddRoundKey(&s
, rounds
);
494 SaveBytes(plain16
, &s
);
497 void AES128_init(AES128_ctx
* ctx
, const unsigned char* key16
) {
498 AES_setup(ctx
->rk
, key16
, 4, 10);
501 void AES128_encrypt(const AES128_ctx
* ctx
, size_t blocks
, unsigned char* cipher16
, const unsigned char* plain16
) {
503 AES_encrypt(ctx
->rk
, 10, cipher16
, plain16
);
509 void AES128_decrypt(const AES128_ctx
* ctx
, size_t blocks
, unsigned char* plain16
, const unsigned char* cipher16
) {
511 AES_decrypt(ctx
->rk
, 10, plain16
, cipher16
);
517 void AES192_init(AES192_ctx
* ctx
, const unsigned char* key24
) {
518 AES_setup(ctx
->rk
, key24
, 6, 12);
521 void AES192_encrypt(const AES192_ctx
* ctx
, size_t blocks
, unsigned char* cipher16
, const unsigned char* plain16
) {
523 AES_encrypt(ctx
->rk
, 12, cipher16
, plain16
);
530 void AES192_decrypt(const AES192_ctx
* ctx
, size_t blocks
, unsigned char* plain16
, const unsigned char* cipher16
) {
532 AES_decrypt(ctx
->rk
, 12, plain16
, cipher16
);
538 void AES256_init(AES256_ctx
* ctx
, const unsigned char* key32
) {
539 AES_setup(ctx
->rk
, key32
, 8, 14);
542 void AES256_encrypt(const AES256_ctx
* ctx
, size_t blocks
, unsigned char* cipher16
, const unsigned char* plain16
) {
544 AES_encrypt(ctx
->rk
, 14, cipher16
, plain16
);
550 void AES256_decrypt(const AES256_ctx
* ctx
, size_t blocks
, unsigned char* plain16
, const unsigned char* cipher16
) {
552 AES_decrypt(ctx
->rk
, 14, plain16
, cipher16
);