1 /* Author: Peter Schwabe, ported from an assembly implementation by Emilia Käsper
/*
 * load32_bigendian is an alias for the namespaced symbol
 * crypto_stream_aes128ctr_portable_load32_bigendian.
 * It takes a read-only byte pointer x and returns a uint32.
 * Going by the name, it presumably decodes the 4 bytes at x most-significant
 * byte first; the definition lives elsewhere, so confirm there.
 */
9 #define load32_bigendian crypto_stream_aes128ctr_portable_load32_bigendian
10 uint32
load32_bigendian(const unsigned char *x
);
/*
 * store32_bigendian is an alias for the namespaced symbol
 * crypto_stream_aes128ctr_portable_store32_bigendian.
 * It takes a writable byte pointer x and a uint32 value u; returns nothing.
 * Going by the name, it presumably writes u to x[0..3] most-significant byte
 * first; the definition lives elsewhere, so confirm there.
 */
12 #define store32_bigendian crypto_stream_aes128ctr_portable_store32_bigendian
13 void store32_bigendian(unsigned char *x
,uint32 u
);
/*
 * load32_littleendian is an alias for the namespaced symbol
 * crypto_stream_aes128ctr_portable_load32_littleendian.
 * It takes a read-only byte pointer x and returns a uint32.
 * Going by the name, it presumably decodes the 4 bytes at x least-significant
 * byte first; the definition lives elsewhere, so confirm there.
 */
15 #define load32_littleendian crypto_stream_aes128ctr_portable_load32_littleendian
16 uint32
load32_littleendian(const unsigned char *x
);
/*
 * store32_littleendian is an alias for the namespaced symbol
 * crypto_stream_aes128ctr_portable_store32_littleendian.
 * It takes a writable byte pointer x and a uint32 value u; returns nothing.
 * Going by the name, it presumably writes u to x[0..3] least-significant byte
 * first; the definition lives elsewhere, so confirm there.
 */
18 #define store32_littleendian crypto_stream_aes128ctr_portable_store32_littleendian
19 void store32_littleendian(unsigned char *x
,uint32 u
);
/*
 * load64_littleendian is an alias for the namespaced symbol
 * crypto_stream_aes128ctr_portable_load64_littleendian.
 * It takes a read-only byte pointer x and returns a uint64.
 * Going by the name, it presumably decodes the 8 bytes at x least-significant
 * byte first; the definition lives elsewhere, so confirm there.
 */
21 #define load64_littleendian crypto_stream_aes128ctr_portable_load64_littleendian
22 uint64
load64_littleendian(const unsigned char *x
);
/*
 * store64_littleendian is an alias for the namespaced symbol
 * crypto_stream_aes128ctr_portable_store64_littleendian.
 * It takes a writable byte pointer x and a uint64 value u; returns nothing.
 * Going by the name, it presumably writes u to x[0..7] least-significant byte
 * first; the definition lives elsewhere, so confirm there.
 */
24 #define store64_littleendian crypto_stream_aes128ctr_portable_store64_littleendian
25 void store64_littleendian(unsigned char *x
,uint64 u
);
27 /* Macros required only for key expansion */
/*
 * keyexpbs1: key-expansion step that reads the 128-byte block at bskey + 0
 * and writes the 128-byte block at bskey + 128.
 *
 * Visible steps, in order:
 *   - invoke sbox() on b0..b7 with t0..t7 as its remaining arguments;
 *   - load t0..t7 as int128 values from bskey + 0, 16, ..., 112;
 *   - call rshift32_littleendian(&tN, 8) on every tN, in three separate passes;
 *   - store b0, b1, b4, b6, b3, b7, b2, b5 (in exactly that order) as int128
 *     values at bskey + 128, 144, ..., 240.
 *
 * All b* and t* arguments are substituted by name; the t* have their address
 * taken and are assigned to, so they must be int128 lvalues. The offsets
 * advance by 16 per int128, so bskey is presumably a byte pointer - confirm
 * at the call site.
 *
 * NOTE(review): how the shifted t* values are combined into b* between the
 * shift passes cannot be read off the lines below; check the full macro.
 */
29 #define keyexpbs1(b0, b1, b2, b3, b4, b5, b6, b7, t0, t1, t2, t3, t4, t5, t6, t7, bskey) \
39 sbox(b0, b1, b2, b3, b4, b5, b6, b7, t0, t1, t2, t3, t4, t5, t6, t7);\
52 t0 = *(int128 *)(bskey + 0);\
53 t1 = *(int128 *)(bskey + 16);\
54 t2 = *(int128 *)(bskey + 32);\
55 t3 = *(int128 *)(bskey + 48);\
56 t4 = *(int128 *)(bskey + 64);\
57 t5 = *(int128 *)(bskey + 80);\
58 t6 = *(int128 *)(bskey + 96);\
59 t7 = *(int128 *)(bskey + 112);\
70 rshift32_littleendian(&t0, 8);\
71 rshift32_littleendian(&t1, 8);\
72 rshift32_littleendian(&t2, 8);\
73 rshift32_littleendian(&t3, 8);\
74 rshift32_littleendian(&t4, 8);\
75 rshift32_littleendian(&t5, 8);\
76 rshift32_littleendian(&t6, 8);\
77 rshift32_littleendian(&t7, 8);\
88 rshift32_littleendian(&t0, 8);\
89 rshift32_littleendian(&t1, 8);\
90 rshift32_littleendian(&t2, 8);\
91 rshift32_littleendian(&t3, 8);\
92 rshift32_littleendian(&t4, 8);\
93 rshift32_littleendian(&t5, 8);\
94 rshift32_littleendian(&t6, 8);\
95 rshift32_littleendian(&t7, 8);\
106 rshift32_littleendian(&t0, 8);\
107 rshift32_littleendian(&t1, 8);\
108 rshift32_littleendian(&t2, 8);\
109 rshift32_littleendian(&t3, 8);\
110 rshift32_littleendian(&t4, 8);\
111 rshift32_littleendian(&t5, 8);\
112 rshift32_littleendian(&t6, 8);\
113 rshift32_littleendian(&t7, 8);\
124 *(int128 *)(bskey + 128) = b0;\
125 *(int128 *)(bskey + 144) = b1;\
126 *(int128 *)(bskey + 160) = b4;\
127 *(int128 *)(bskey + 176) = b6;\
128 *(int128 *)(bskey + 192) = b3;\
129 *(int128 *)(bskey + 208) = b7;\
130 *(int128 *)(bskey + 224) = b2;\
131 *(int128 *)(bskey + 240) = b5;\
/*
 * keyexpbs10: key-expansion step that reads the 128-byte block at
 * bskey + 9 * 128 and writes the 128-byte block at bskey + 1280 (= 10 * 128).
 *
 * Visible steps, in order:
 *   - invoke sbox() on b0..b7 with t0..t7 as its remaining arguments;
 *   - load t0..t7 as int128 values from bskey + 9 * 128 + 0, 16, ..., 112;
 *   - call rshift32_littleendian(&tN, 8) on every tN, in three separate passes;
 *   - store b0, b1, b4, b6, b3, b7, b2, b5 (in exactly that order) as int128
 *     values at bskey + 1280, 1296, ..., 1392.
 *
 * The ';' that follows the parameter list on the #define line expands to an
 * empty statement in front of the sbox() call; the sibling macros keyexpbs1
 * and keyexpbs do not have it.
 *
 * All b* and t* arguments are substituted by name; the t* have their address
 * taken and are assigned to, so they must be int128 lvalues.
 *
 * NOTE(review): how the shifted t* values are combined into b* between the
 * shift passes cannot be read off the lines below; check the full macro.
 */
133 #define keyexpbs10(b0, b1, b2, b3, b4, b5, b6, b7, t0, t1, t2, t3, t4, t5, t6, t7, bskey) ;\
147 sbox(b0, b1, b2, b3, b4, b5, b6, b7, t0, t1, t2, t3, t4, t5, t6, t7);\
162 t0 = *(int128 *)(bskey + 9 * 128 + 0);\
163 t1 = *(int128 *)(bskey + 9 * 128 + 16);\
164 t2 = *(int128 *)(bskey + 9 * 128 + 32);\
165 t3 = *(int128 *)(bskey + 9 * 128 + 48);\
166 t4 = *(int128 *)(bskey + 9 * 128 + 64);\
167 t5 = *(int128 *)(bskey + 9 * 128 + 80);\
168 t6 = *(int128 *)(bskey + 9 * 128 + 96);\
169 t7 = *(int128 *)(bskey + 9 * 128 + 112);\
185 rshift32_littleendian(&t0, 8);\
186 rshift32_littleendian(&t1, 8);\
187 rshift32_littleendian(&t2, 8);\
188 rshift32_littleendian(&t3, 8);\
189 rshift32_littleendian(&t4, 8);\
190 rshift32_littleendian(&t5, 8);\
191 rshift32_littleendian(&t6, 8);\
192 rshift32_littleendian(&t7, 8);\
203 rshift32_littleendian(&t0, 8);\
204 rshift32_littleendian(&t1, 8);\
205 rshift32_littleendian(&t2, 8);\
206 rshift32_littleendian(&t3, 8);\
207 rshift32_littleendian(&t4, 8);\
208 rshift32_littleendian(&t5, 8);\
209 rshift32_littleendian(&t6, 8);\
210 rshift32_littleendian(&t7, 8);\
221 rshift32_littleendian(&t0, 8);\
222 rshift32_littleendian(&t1, 8);\
223 rshift32_littleendian(&t2, 8);\
224 rshift32_littleendian(&t3, 8);\
225 rshift32_littleendian(&t4, 8);\
226 rshift32_littleendian(&t5, 8);\
227 rshift32_littleendian(&t6, 8);\
228 rshift32_littleendian(&t7, 8);\
248 *(int128 *)(bskey + 1280) = b0;\
249 *(int128 *)(bskey + 1296) = b1;\
250 *(int128 *)(bskey + 1312) = b4;\
251 *(int128 *)(bskey + 1328) = b6;\
252 *(int128 *)(bskey + 1344) = b3;\
253 *(int128 *)(bskey + 1360) = b7;\
254 *(int128 *)(bskey + 1376) = b2;\
255 *(int128 *)(bskey + 1392) = b5;\
/*
 * keyexpbs: generic key-expansion step for round i. Reads the 128-byte block
 * at bskey + (i-1) * 128 and writes the 128-byte block at bskey + i*128.
 *
 * Visible steps, in order:
 *   - invoke sbox() on b0..b7 with t0..t7 as its remaining arguments;
 *   - load t0..t7 as int128 values from bskey + (i-1) * 128 + 0, 16, ..., 112;
 *   - call rshift32_littleendian(&tN, 8) on every tN, in three separate passes;
 *   - store b0, b1, b4, b6, b3, b7, b2, b5 (in exactly that order) as int128
 *     values at bskey + i*128 + 0, 16, ..., 112.
 *
 * i is pasted into the expansion without parentheses ("i*128"), so pass a
 * literal or a single identifier, never a compound expression such as a+1.
 *
 * All b* and t* arguments are substituted by name; the t* have their address
 * taken and are assigned to, so they must be int128 lvalues.
 *
 * NOTE(review): rcon is not referenced by any line below, and how the shifted
 * t* values are combined into b* cannot be read off these lines; check the
 * full macro.
 */
258 #define keyexpbs(b0, b1, b2, b3, b4, b5, b6, b7, t0, t1, t2, t3, t4, t5, t6, t7, rcon, i, bskey) \
272 sbox(b0, b1, b2, b3, b4, b5, b6, b7, t0, t1, t2, t3, t4, t5, t6, t7);\
284 t0 = *(int128 *)(bskey + (i-1) * 128 + 0);\
285 t1 = *(int128 *)(bskey + (i-1) * 128 + 16);\
286 t2 = *(int128 *)(bskey + (i-1) * 128 + 32);\
287 t3 = *(int128 *)(bskey + (i-1) * 128 + 48);\
288 t4 = *(int128 *)(bskey + (i-1) * 128 + 64);\
289 t5 = *(int128 *)(bskey + (i-1) * 128 + 80);\
290 t6 = *(int128 *)(bskey + (i-1) * 128 + 96);\
291 t7 = *(int128 *)(bskey + (i-1) * 128 + 112);\
307 rshift32_littleendian(&t0, 8);\
308 rshift32_littleendian(&t1, 8);\
309 rshift32_littleendian(&t2, 8);\
310 rshift32_littleendian(&t3, 8);\
311 rshift32_littleendian(&t4, 8);\
312 rshift32_littleendian(&t5, 8);\
313 rshift32_littleendian(&t6, 8);\
314 rshift32_littleendian(&t7, 8);\
325 rshift32_littleendian(&t0, 8);\
326 rshift32_littleendian(&t1, 8);\
327 rshift32_littleendian(&t2, 8);\
328 rshift32_littleendian(&t3, 8);\
329 rshift32_littleendian(&t4, 8);\
330 rshift32_littleendian(&t5, 8);\
331 rshift32_littleendian(&t6, 8);\
332 rshift32_littleendian(&t7, 8);\
343 rshift32_littleendian(&t0, 8);\
344 rshift32_littleendian(&t1, 8);\
345 rshift32_littleendian(&t2, 8);\
346 rshift32_littleendian(&t3, 8);\
347 rshift32_littleendian(&t4, 8);\
348 rshift32_littleendian(&t5, 8);\
349 rshift32_littleendian(&t6, 8);\
350 rshift32_littleendian(&t7, 8);\
361 *(int128 *)(bskey + i*128 + 0) = b0;\
362 *(int128 *)(bskey + i*128 + 16) = b1;\
363 *(int128 *)(bskey + i*128 + 32) = b4;\
364 *(int128 *)(bskey + i*128 + 48) = b6;\
365 *(int128 *)(bskey + i*128 + 64) = b3;\
366 *(int128 *)(bskey + i*128 + 80) = b7;\
367 *(int128 *)(bskey + i*128 + 96) = b2;\
368 *(int128 *)(bskey + i*128 + 112) = b5;\
370 /* Macros used in multiple contexts */
/*
 * bitslicekey0: bitslice the 16 bytes at key and write the result to the
 * 128-byte block at bskey + 0.
 *
 * Steps:
 *   - load xmm0 as an int128 from key + 0;
 *   - copy2() xmm0 into each of xmm1..xmm7;
 *   - run bitslice() over the eight values, passed in reverse order
 *     (xmm7 first, xmm0 last), with t as scratch;
 *   - store xmm0..xmm7 as int128 values at bskey + 0, 16, ..., 112.
 *
 * xmm0..xmm7 and t are not parameters: the macro uses whatever variables of
 * those names are in scope at the point of expansion.
 */
372 #define bitslicekey0(key, bskey) \
373 xmm0 = *(int128 *) (key + 0);\
375 copy2(&xmm1, &xmm0);\
376 copy2(&xmm2, &xmm0);\
377 copy2(&xmm3, &xmm0);\
378 copy2(&xmm4, &xmm0);\
379 copy2(&xmm5, &xmm0);\
380 copy2(&xmm6, &xmm0);\
381 copy2(&xmm7, &xmm0);\
383 bitslice(xmm7, xmm6, xmm5, xmm4, xmm3, xmm2, xmm1, xmm0, t);\
385 *(int128 *) (bskey + 0) = xmm0;\
386 *(int128 *) (bskey + 16) = xmm1;\
387 *(int128 *) (bskey + 32) = xmm2;\
388 *(int128 *) (bskey + 48) = xmm3;\
389 *(int128 *) (bskey + 64) = xmm4;\
390 *(int128 *) (bskey + 80) = xmm5;\
391 *(int128 *) (bskey + 96) = xmm6;\
392 *(int128 *) (bskey + 112) = xmm7;\
/*
 * bitslicekey10: bitslice the 16 bytes at key and write the result to the
 * 128-byte block at bskey + 1280 (= 10 * 128).
 *
 * Visible steps:
 *   - load xmm0 as an int128 from key + 0;
 *   - run bitslice() over xmm7..xmm0 (reverse order) with t as scratch;
 *   - store xmm0..xmm7 as int128 values at bskey + 1280 + 0, 16, ..., 112.
 *
 * xmm0..xmm7 and t are not parameters: the macro uses whatever variables of
 * those names are in scope at the point of expansion.
 *
 * NOTE(review): unlike bitslicekey0 and bitslicekey, no line below assigns
 * xmm1..xmm7 before bitslice() runs; confirm how they are populated.
 */
395 #define bitslicekey10(key, bskey) \
396 xmm0 = *(int128 *) (key + 0);\
405 bitslice(xmm7, xmm6, xmm5, xmm4, xmm3, xmm2, xmm1, xmm0, t);\
412 *(int128 *) (bskey + 0 + 1280) = xmm0;\
413 *(int128 *) (bskey + 16 + 1280) = xmm1;\
414 *(int128 *) (bskey + 32 + 1280) = xmm2;\
415 *(int128 *) (bskey + 48 + 1280) = xmm3;\
416 *(int128 *) (bskey + 64 + 1280) = xmm4;\
417 *(int128 *) (bskey + 80 + 1280) = xmm5;\
418 *(int128 *) (bskey + 96 + 1280) = xmm6;\
419 *(int128 *) (bskey + 112 + 1280) = xmm7;\
/*
 * bitslicekey: bitslice the 16 bytes at key and write the result to the
 * 128-byte block at bskey + 128*i.
 *
 * Visible steps:
 *   - load xmm0 as an int128 from key + 0;
 *   - copy2() xmm0 into each of xmm1..xmm7;
 *   - run bitslice() over xmm7..xmm0 (reverse order) with t as scratch;
 *   - store xmm0..xmm7 as int128 values at bskey + 128*i + 0, 16, ..., 112.
 *
 * i is pasted into the expansion without parentheses ("128*i"), so pass a
 * literal or a single identifier, never a compound expression such as a+1.
 *
 * xmm0..xmm7 and t are not parameters: the macro uses whatever variables of
 * those names are in scope at the point of expansion.
 */
422 #define bitslicekey(i,key,bskey) \
423 xmm0 = *(int128 *) (key + 0);\
425 copy2(&xmm1, &xmm0);\
426 copy2(&xmm2, &xmm0);\
427 copy2(&xmm3, &xmm0);\
428 copy2(&xmm4, &xmm0);\
429 copy2(&xmm5, &xmm0);\
430 copy2(&xmm6, &xmm0);\
431 copy2(&xmm7, &xmm0);\
433 bitslice(xmm7, xmm6, xmm5, xmm4, xmm3, xmm2, xmm1, xmm0, t);\
440 *(int128 *) (bskey + 0 + 128*i) = xmm0;\
441 *(int128 *) (bskey + 16 + 128*i) = xmm1;\
442 *(int128 *) (bskey + 32 + 128*i) = xmm2;\
443 *(int128 *) (bskey + 48 + 128*i) = xmm3;\
444 *(int128 *) (bskey + 64 + 128*i) = xmm4;\
445 *(int128 *) (bskey + 80 + 128*i) = xmm5;\
446 *(int128 *) (bskey + 96 + 128*i) = xmm6;\
447 *(int128 *) (bskey + 112 + 128*i) = xmm7;\
/*
 * bitslice: apply three stages of swapmove() to the eight values x0..x7,
 * using t as the scratch argument throughout.
 *
 *   stage 1 (n = 1, m = BS0): pairs (x0,x1) (x2,x3) (x4,x5) (x6,x7)
 *   stage 2 (n = 2, m = BS1): pairs (x0,x2) (x1,x3) (x4,x6) (x5,x7)
 *   stage 3 (n = 4, m = BS2): pairs (x0,x4) (x1,x5) (x2,x6) (x3,x7)
 *
 * BS0, BS1 and BS2 are not defined in this part of the file. The pairing is
 * the usual 8x8 bit-matrix transpose pattern (distances 1, 2, 4), but that
 * reading depends on what swapmove() does - confirm against its definition.
 */
450 #define bitslice(x0, x1, x2, x3, x4, x5, x6, x7, t) \
451 swapmove(x0, x1, 1, BS0, t);\
452 swapmove(x2, x3, 1, BS0, t);\
453 swapmove(x4, x5, 1, BS0, t);\
454 swapmove(x6, x7, 1, BS0, t);\
456 swapmove(x0, x2, 2, BS1, t);\
457 swapmove(x1, x3, 2, BS1, t);\
458 swapmove(x4, x6, 2, BS1, t);\
459 swapmove(x5, x7, 2, BS1, t);\
461 swapmove(x0, x4, 4, BS2, t);\
462 swapmove(x1, x5, 4, BS2, t);\
463 swapmove(x2, x6, 4, BS2, t);\
464 swapmove(x3, x7, 4, BS2, t);\
/*
 * swapmove(a, b, n, m, t): helper used by bitslice(). The lines below show
 * t being shifted right by n and then left by n, via rshift64_littleendian()
 * and lshift64_littleendian(); t is passed by address, so it must be an
 * lvalue.
 *
 * NOTE(review): the statements that involve a, b and m are not among the
 * lines below, so the exact exchange performed cannot be stated from here.
 *
 * NOTE(review): the final shufb(x, ROTB) line refers to x, which is not a
 * parameter of swapmove. It presumably belongs to a separate macro whose
 * #define line is missing at this point - confirm and restore it.
 */
467 #define swapmove(a, b, n, m, t) \
469 rshift64_littleendian(&t, n);\
473 lshift64_littleendian(&t, n);\
477 shufb(x, ROTB) /* TODO: Make faster */
480 /* Macros used for encryption (and decryption) */
/*
 * shiftrows(x0..x7, i, M, bskey): the lines below XOR each xN in place, via
 * xor2(), with the int128 at bskey + 128*(i-1) + 16*N, i.e. with the
 * 128-byte block that belongs to index i-1.
 *
 * Each xN has its address taken, so it must be an lvalue.
 *
 * NOTE(review): M is not referenced by any line below; presumably it selects
 * a byte permutation applied to each xN after the XOR - confirm against the
 * full macro.
 */
482 #define shiftrows(x0, x1, x2, x3, x4, x5, x6, x7, i, M, bskey) \
483 xor2(&x0, (int128 *)(bskey + 128*(i-1) + 0));\
485 xor2(&x1, (int128 *)(bskey + 128*(i-1) + 16));\
487 xor2(&x2, (int128 *)(bskey + 128*(i-1) + 32));\
489 xor2(&x3, (int128 *)(bskey + 128*(i-1) + 48));\
491 xor2(&x4, (int128 *)(bskey + 128*(i-1) + 64));\
493 xor2(&x5, (int128 *)(bskey + 128*(i-1) + 80));\
495 xor2(&x6, (int128 *)(bskey + 128*(i-1) + 96));\
497 xor2(&x7, (int128 *)(bskey + 128*(i-1) + 112));\
/*
 * mixcolumns(x0..x7, t0..t7): the lines below show two shuffle passes.
 *
 *   - first, tN = shufd(xN, 0x93) for N = 0..7 (the result goes to tN and
 *     xN is the source);
 *   - later, xN = shufd(xN, 0x4e) for N = 0..7, in place.
 *
 * All x* and t* arguments have their address taken, so they must be lvalues.
 *
 * NOTE(review): 0x93 and 0x4e are presumably 32-bit-lane shuffle selectors
 * in the style of x86 pshufd; confirm against shufd()'s definition. The
 * statements that combine x* and t* between and after the two passes are not
 * among the lines below.
 */
501 #define mixcolumns(x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2, t3, t4, t5, t6, t7) \
502 shufd(&t0, &x0, 0x93);\
503 shufd(&t1, &x1, 0x93);\
504 shufd(&t2, &x2, 0x93);\
505 shufd(&t3, &x3, 0x93);\
506 shufd(&t4, &x4, 0x93);\
507 shufd(&t5, &x5, 0x93);\
508 shufd(&t6, &x6, 0x93);\
509 shufd(&t7, &x7, 0x93);\
532 shufd(&x0, &x0, 0x4e);\
533 shufd(&x1, &x1, 0x4e);\
534 shufd(&x2, &x2, 0x4e);\
535 shufd(&x3, &x3, 0x4e);\
536 shufd(&x4, &x4, 0x4e);\
537 shufd(&x5, &x5, 0x4e);\
538 shufd(&x6, &x6, 0x4e);\
539 shufd(&x7, &x7, 0x4e);\
/*
 * aesround(i, b0..b7, t0..t7, bskey): one full round, built from three
 * macros in this order:
 *   1. shiftrows() on b0..b7 with index i, the constant SR, and bskey;
 *   2. sbox() on b0..b7 with t0..t7 as its remaining arguments;
 *   3. mixcolumns() on the b values in the order b0, b1, b4, b6, b3, b7, b2,
 *      b5, with t0..t7 as its remaining arguments.
 *
 * The permuted order in step 3 is the same order in which the key-expansion
 * and last-round macros consume the b values after sbox().
 * SR is not defined in this part of the file.
 */
551 #define aesround(i, b0, b1, b2, b3, b4, b5, b6, b7, t0, t1, t2, t3, t4, t5, t6, t7, bskey) \
552 shiftrows(b0, b1, b2, b3, b4, b5, b6, b7, i, SR, bskey);\
553 sbox(b0, b1, b2, b3, b4, b5, b6, b7, t0, t1, t2, t3, t4, t5, t6, t7);\
554 mixcolumns(b0, b1, b4, b6, b3, b7, b2, b5, t0, t1, t2, t3, t4, t5, t6, t7);\
/*
 * lastround(b0..b7, t0..t7, bskey): final round. It has no mixcolumns()
 * step; instead it XORs in the 128-byte block at bskey + 128*10.
 *
 *   1. shiftrows() on b0..b7 with index 10, the constant SRM0, and bskey;
 *   2. sbox() on b0..b7 with t0..t7 as its remaining arguments;
 *   3. xor2() of b0, b1, b4, b6, b3, b7, b2, b5 (in that order) with the
 *      int128 values at bskey + 128*10 + 0, 16, ..., 112.
 *
 * The permuted order in step 3 matches the order in which the key-expansion
 * macros store the b values. SRM0 is not defined in this part of the file.
 */
557 #define lastround(b0, b1, b2, b3, b4, b5, b6, b7, t0, t1, t2, t3, t4, t5, t6, t7, bskey) \
558 shiftrows(b0, b1, b2, b3, b4, b5, b6, b7, 10, SRM0, bskey);\
559 sbox(b0, b1, b2, b3, b4, b5, b6, b7, t0, t1, t2, t3, t4, t5, t6, t7);\
560 xor2(&b0,(int128 *)(bskey + 128*10));\
561 xor2(&b1,(int128 *)(bskey + 128*10+16));\
562 xor2(&b4,(int128 *)(bskey + 128*10+32));\
563 xor2(&b6,(int128 *)(bskey + 128*10+48));\
564 xor2(&b3,(int128 *)(bskey + 128*10+64));\
565 xor2(&b7,(int128 *)(bskey + 128*10+80));\
566 xor2(&b2,(int128 *)(bskey + 128*10+96));\
567 xor2(&b5,(int128 *)(bskey + 128*10+112));\
/*
 * sbox(b0..b7, t0..t3, s0..s3): three-stage pipeline over the eight b values.
 *   1. InBasisChange() on b0..b7 in natural order;
 *   2. Inv_GF256() on the b values in the order b6, b5, b0, b3, b7, b1, b4,
 *      b2, with t0..t3 and s0..s3 as its remaining arguments;
 *   3. OutBasisChange() on the b values in the order b7, b1, b4, b2, b6, b5,
 *      b0, b3.
 *
 * The argument permutations are significant: callers in this file read the
 * results back in the order b0, b1, b4, b6, b3, b7, b2, b5 rather than
 * b0..b7.
 */
570 #define sbox(b0, b1, b2, b3, b4, b5, b6, b7, t0, t1, t2, t3, s0, s1, s2, s3) \
571 InBasisChange(b0, b1, b2, b3, b4, b5, b6, b7); \
572 Inv_GF256(b6, b5, b0, b3, b7, b1, b4, b2, t0, t1, t2, t3, s0, s1, s2, s3); \
573 OutBasisChange(b7, b1, b4, b2, b6, b5, b0, b3); \
/*
 * InBasisChange(b0..b7): first stage of sbox(), which invokes it on b0..b7
 * in natural order. NOTE(review): the macro body is not present at this
 * point; presumably a linear change of basis on the eight values - confirm.
 */
576 #define InBasisChange(b0, b1, b2, b3, b4, b5, b6, b7) \
/*
 * OutBasisChange(b0..b7): last stage of sbox(), which invokes it with the
 * values in the order b7, b1, b4, b2, b6, b5, b0, b3. NOTE(review): the
 * macro body is not present at this point; presumably the change of basis
 * back out of the inversion representation - confirm.
 */
593 #define OutBasisChange(b0, b1, b2, b3, b4, b5, b6, b7) \
/*
 * Mul_GF4(x0, x1, y0, y1, t0): building block invoked by Mul_GF16() and
 * Mul_GF16_2(). NOTE(review): the macro body is not present at this point;
 * going by the name and call sites, presumably multiplies the pair (x0, x1)
 * by (y0, y1) in GF(4) with t0 as scratch - confirm.
 */
608 #define Mul_GF4(x0, x1, y0, y1, t0) \
/*
 * Mul_GF4_N(x0, x1, y0, y1, t0): variant of Mul_GF4() invoked by Mul_GF16()
 * and Mul_GF16_2(). NOTE(review): the macro body is not present at this
 * point, so how it differs from Mul_GF4() cannot be stated here - confirm.
 */
618 #define Mul_GF4_N(x0, x1, y0, y1, t0) \
/*
 * Mul_GF4_2(x0, x1, x2, x3, y0, y1, t0, t1): invoked by Inv_GF16().
 * NOTE(review): the macro body is not present at this point; going by the
 * signature, presumably multiplies both (x0, x1) and (x2, x3) by the same
 * (y0, y1), with t0 and t1 as scratch - confirm.
 */
628 #define Mul_GF4_2(x0, x1, x2, x3, y0, y1, t0, t1) \
/*
 * Mul_GF16(x0..x3, y0..y3, t0..t3): composed from GF(4)-level macros. The
 * lines below show, in order:
 *   - Mul_GF4(x0, x1, y0, y1, t2)
 *   - Mul_GF4_N(t0, t1, y0, y1, t2)
 *   - Mul_GF4(x2, x3, y2, y3, t3)
 *
 * NOTE(review): the statements that set up t0/t1 and combine the partial
 * results are not among the lines below; y0/y1 are presumably modified
 * between the calls, since they are reused as multiplier - confirm against
 * the full macro.
 */
645 #define Mul_GF16(x0, x1, x2, x3, y0, y1, y2, y3, t0, t1, t2, t3) \
648 Mul_GF4(x0, x1, y0, y1, t2);\
653 Mul_GF4_N(t0, t1, y0, y1, t2);\
654 Mul_GF4(x2, x3, y2, y3, t3);\
/*
 * Mul_GF16_2(x0..x7, y0..y3, t0..t3): handles two four-value operands
 * (x0..x3 and x4..x7) against the same y0..y3. The lines below show, in
 * order:
 *   - Mul_GF4(x0, x1, y0, y1, t2)
 *   - Mul_GF4_N(t0, t1, y0, y1, t3)
 *   - Mul_GF4(x2, x3, y2, y3, t2)
 *   - Mul_GF4_N(t0, t1, y0, y1, t3)
 *   - Mul_GF4(x6, x7, y2, y3, t2)
 *   - Mul_GF4(x4, x5, y0, y1, t3)
 *
 * NOTE(review): the statements between these calls (setting up t0/t1,
 * updating y*, folding the partial products) are not among the lines below;
 * confirm against the full macro.
 */
661 #define Mul_GF16_2(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, t0, t1, t2, t3) \
664 Mul_GF4(x0, x1, y0, y1, t2);\
669 Mul_GF4_N(t0, t1, y0, y1, t3);\
670 Mul_GF4(x2, x3, y2, y3, t2);\
681 Mul_GF4_N(t0, t1, y0, y1, t3);\
682 Mul_GF4(x6, x7, y2, y3, t2);\
685 Mul_GF4(x4, x5, y0, y1, t3);\
/*
 * Inv_GF16(x0..x3, t0..t3): the only step shown below is a call to
 * Mul_GF4_2(x0, x1, x2, x3, t1, t0, t2, t3). Note that t1 and t0 are passed
 * in swapped order as the y0/y1 arguments.
 *
 * NOTE(review): the statements that compute t0/t1 before this call are not
 * among the lines below; going by the name, the macro presumably inverts
 * (x0..x3) in GF(16) - confirm against the full macro.
 */
692 #define Inv_GF16(x0, x1, x2, x3, t0, t1, t2, t3) \
706 Mul_GF4_2(x0, x1, x2, x3, t1, t0, t2, t3);\
/*
 * Inv_GF256(x0..x7, t0..t3, s0..s3): middle stage of sbox(). The only step
 * shown below is a call to Mul_GF16_2() on x0..x7 with (s3, s2, s1, t1) as
 * the y arguments and (s0, t0, t2, t3) as scratch.
 *
 * NOTE(review): the statements that compute s* and t* before this call are
 * not among the lines below; going by the name, the macro presumably inverts
 * the bitsliced byte in GF(256) - confirm against the full macro.
 */
709 #define Inv_GF256(x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2, t3, s0, s1, s2, s3) \
786 Mul_GF16_2(x0, x1, x2, x3, x4, x5, x6, x7, s3, s2, s1, t1, s0, t0, t2, t3);\