src/TortoisePlink/crypto/aes-sw.c

   1 /*
   2  * Software implementation of AES.
   3  *
   4  * This implementation uses a bit-sliced representation. Instead of
   5  * the obvious approach of storing the cipher state so that each byte
   6  * (or field element, or entry in the cipher matrix) occupies 8
   7  * contiguous bits in a machine integer somewhere, we organise the
   8  * cipher state as an array of 8 integers, in such a way that each
   9  * logical byte of the cipher state occupies one bit in each integer,
  10  * all at the same position. This allows us to do parallel logic on
  11  * all bytes of the state by doing bitwise operations between the 8
  12  * integers; in particular, the S-box (SubBytes) lookup is done this
  13  * way, which takes about 110 operations - but for those 110 bitwise
  14  * ops you get 64 S-box lookups, not just one.
  15  */
  16
  17 #include "ssh.h"
  18 #include "aes.h"
  19 #include "mpint_i.h"               /* we reuse the BignumInt system */
  20
  21 static bool aes_sw_available(void)
  22 {
  23     /* No probing is done here: software AES is always available. */
  24     return true;
  25 }
  26
  27 #define SLICE_PARALLELISM (BIGNUM_INT_BYTES / 2)
     /* ^ Number of 16-bit groups walked per BignumInt slice word (see the
      * iter*16 indexing in dumpslices_BignumInt).
      * NOTE(review): presumably BIGNUM_INT_BYTES is the byte width of
      * BignumInt, declared in mpint_i.h - confirm. */
  28
  29 #ifdef BITSLICED_DEBUG
  30 /* Dump function that undoes the bitslicing transform, so you can see
  31  * the logical data represented by a set of slice words. Prints prefix
      * left-justified in a 30-column field, then the 16 reconstructed bytes
      * as hex, then a newline. */
  32 static inline void dumpslices_uint16_t(
  33     const char *prefix, const uint16_t slices[8])
  34 {
  35     printf("%-30s", prefix);
  36     for (unsigned byte = 0; byte < 16; byte++) {
  37         unsigned byteval = 0;
             /* Bit 'bit' of logical byte 'byte' is stored at bit position
              * 'byte' of slices[bit]. */
  38         for (unsigned bit = 0; bit < 8; bit++)
  39             byteval |= (1 & (slices[bit] >> byte)) << bit;
  40         printf("%02x", byteval);
  41     }
  42     printf("\n");
  43 }
  44
     /* As above, but for BignumInt slices: prints SLICE_PARALLELISM groups
      * of 16 bytes, separated by single spaces. Group 'iter' occupies bit
      * positions iter*16 .. iter*16+15 of every slice word. */
  45 static inline void dumpslices_BignumInt(
  46     const char *prefix, const BignumInt slices[8])
  47 {
  48     printf("%-30s", prefix);
  49     for (unsigned iter = 0; iter < SLICE_PARALLELISM; iter++) {
  50         for (unsigned byte = 0; byte < 16; byte++) {
  51             unsigned byteval = 0;
  52             for (unsigned bit = 0; bit < 8; bit++)
  53                 byteval |= (1 & (slices[bit] >> (iter*16+byte))) << bit;
  54             printf("%02x", byteval);
  55         }
                 /* Separator between groups, but not after the last one. */
  56         if (iter+1 < SLICE_PARALLELISM)
  57             printf(" ");
  58     }
  59     printf("\n");
  60 }
  61 #else
     /* Without BITSLICED_DEBUG the dump calls compile to nothing. A no-op is
      * provided for every name the debug branch defines (dumpslices_uint16_t
      * and dumpslices_BignumInt), so a call site builds in both
      * configurations; dumpslices_uintN_t is kept for any existing user of
      * that spelling. */
  62 #define dumpslices_uintN_t(prefix, slices) ((void)0)
     #define dumpslices_uint16_t(prefix, slices) ((void)0)
  63 #define dumpslices_BignumInt(prefix, slices) ((void)0)
  64 #endif
  65
  66 /* -----
  67  * Bit-slicing transformation: convert between an array of 16 uint8_t
  68  * and an array of 8 uint16_t, so as to interchange the bit index
  69  * within each element and the element index within the array. (That
  70  * is, bit j of input[i] == bit i of output[j].)
  71  */
  72
  73 #define SWAPWORDS(shift) do                                     \
  74     {                                                           \
             /* Exchange the bits of i0 selected by (mask << shift)
              * with the bits of i1 selected by mask. i0 and i1 are
              * not parameters: they must be uint64_t variables in
              * the scope where this macro is expanded. */           \
             /* mask sets the low 'shift' bits of every group of
              * 2*shift bits, e.g. shift 8 -> 0x00FF00FF00FF00FF. */ \
  75         uint64_t mask = ~(uint64_t)0 / ((1ULL << shift) + 1);   \
             /* diff marks the positions where the two bit sets
              * differ; XORing it into both sides swaps them. */     \
  76         uint64_t diff = ((i0 >> shift) ^ i1) & mask;            \
  77         i0 ^= diff << shift;                                    \
  78         i1 ^= diff;                                             \
  79     } while (0)
  80
  81 #define SWAPINWORD(i, bigshift, smallshift) do                  \
  82     {                                                           \
             /* Within the single word i, exchange every bit selected
              * by mask with the bit (bigshift - smallshift) places
              * above it. i is both read and assigned, so it must be
              * a modifiable 64-bit lvalue. For (8, 1) the mask is
              * 0x00AA00AA00AA00AA and the distance is 7. */         \
  83         uint64_t mask = ~(uint64_t)0;                           \
  84         mask /= ((1ULL << bigshift) + 1);                       \
  85         mask /= ((1ULL << smallshift) + 1);                     \
  86         mask <<= smallshift;                                    \
  87         unsigned shift = bigshift - smallshift;                 \
             /* diff marks positions where the paired bits differ;
              * XORing it in at both positions swaps each pair. */   \
  88         uint64_t diff = ((i >> shift) ^ i) & mask;              \
  89         i ^= diff ^ (diff << shift);                            \
  90     } while (0)
  91
  92 #define TO_BITSLICES(slices, bytes, uintN_t, assign_op, shift) do       \
  93     {                                                                   \
             /* Fetch two 64-bit words with GET_64BIT_LSB_FIRST from bytes
              * and bytes+8, rearrange their bits with the SWAPINWORD and
              * SWAPWORDS steps below, then hand out the eight 16-bit
              * fields: field k is cast to uintN_t, moved up by 'shift'
              * bits and combined into slices[k] with assign_op.
              * Even-numbered slices come from i0, odd-numbered from i1. */  \
  94         uint64_t i0 = GET_64BIT_LSB_FIRST(bytes);                       \
             /* 'bytes' is parenthesised so that an expression argument
              * keeps its own grouping, as FROM_BITSLICES already does. */   \
  95         uint64_t i1 = GET_64BIT_LSB_FIRST((bytes) + 8);                 \
  96         SWAPINWORD(i0, 8, 1);                                           \
  97         SWAPINWORD(i1, 8, 1);                                           \
  98         SWAPINWORD(i0, 16, 2);                                          \
  99         SWAPINWORD(i1, 16, 2);                                          \
 100         SWAPINWORD(i0, 32, 4);                                          \
 101         SWAPINWORD(i1, 32, 4);                                          \
             /* SWAPWORDS operates on the locals i0 and i1 by name. */       \
 102         SWAPWORDS(8);                                                   \
 103         slices[0] assign_op (uintN_t)((i0 >>  0) & 0xFFFF) << (shift);  \
 104         slices[2] assign_op (uintN_t)((i0 >> 16) & 0xFFFF) << (shift);  \
 105         slices[4] assign_op (uintN_t)((i0 >> 32) & 0xFFFF) << (shift);  \
 106         slices[6] assign_op (uintN_t)((i0 >> 48) & 0xFFFF) << (shift);  \
 107         slices[1] assign_op (uintN_t)((i1 >>  0) & 0xFFFF) << (shift);  \
 108         slices[3] assign_op (uintN_t)((i1 >> 16) & 0xFFFF) << (shift);  \
 109         slices[5] assign_op (uintN_t)((i1 >> 32) & 0xFFFF) << (shift);  \
 110         slices[7] assign_op (uintN_t)((i1 >> 48) & 0xFFFF) << (shift);  \
 111     } while (0)
 112
 113 #define FROM_BITSLICES(bytes, slices, shift) do                 \
 114     {                                                           \
             /* Reverse of TO_BITSLICES: gather the 16-bit field at
              * bit offset 'shift' of each slice (odd slices into i1,
              * even slices into i0, lowest-numbered slice in the
              * lowest 16 bits), apply the same swap steps in the
              * opposite order, and store the two words with
              * PUT_64BIT_LSB_FIRST at bytes and bytes+8. */         \
 115         uint64_t i1 = ((slices[7] >> (shift)) & 0xFFFF);        \
 116         i1 = (i1 << 16) | ((slices[5] >> (shift)) & 0xFFFF);    \
 117         i1 = (i1 << 16) | ((slices[3] >> (shift)) & 0xFFFF);    \
 118         i1 = (i1 << 16) | ((slices[1] >> (shift)) & 0xFFFF);    \
 119         uint64_t i0 = ((slices[6] >> (shift)) & 0xFFFF);        \
 120         i0 = (i0 << 16) | ((slices[4] >> (shift)) & 0xFFFF);    \
 121         i0 = (i0 << 16) | ((slices[2] >> (shift)) & 0xFFFF);    \
 122         i0 = (i0 << 16) | ((slices[0] >> (shift)) & 0xFFFF);    \
             /* SWAPWORDS operates on the locals i0 and i1 by name. */ \
 123         SWAPWORDS(8);                                           \
 124         SWAPINWORD(i0, 32, 4);                                  \
 125         SWAPINWORD(i1, 32, 4);                                  \
 126         SWAPINWORD(i0, 16, 2);                                  \
 127         SWAPINWORD(i1, 16, 2);                                  \
 128         SWAPINWORD(i0, 8, 1);                                   \
 129         SWAPINWORD(i1, 8, 1);                                   \
 130         PUT_64BIT_LSB_FIRST(bytes, i0);                         \
 131         PUT_64BIT_LSB_FIRST((bytes) + 8, i1);                   \
 132     } while (0)
 133
 134 /* -----
 135  * Some macros that will be useful repeatedly.
 136  */
 137
 138 /* Iterate a unary transformation over all 8 slices: MACRO is expanded
      * once for each index k = 0..7 as MACRO(output[k], input[k], uintN_t),
      * in ascending order of k. */
 139 #define ITERATE(MACRO, output, input, uintN_t) do       \
 140     {                                                   \
 141         MACRO(output[0], input[0], uintN_t);            \
 142         MACRO(output[1], input[1], uintN_t);            \
 143         MACRO(output[2], input[2], uintN_t);            \
 144         MACRO(output[3], input[3], uintN_t);            \
 145         MACRO(output[4], input[4], uintN_t);            \
 146         MACRO(output[5], input[5], uintN_t);            \
 147         MACRO(output[6], input[6], uintN_t);            \
 148         MACRO(output[7], input[7], uintN_t);            \
 149     } while (0)
 150
 151 /* Simply add (i.e. XOR) two whole sets of slices together. Each
      * output[k] depends only on lhs[k] and rhs[k], so output may be the
      * same array as lhs or rhs (other macros in this file call it that
      * way). */
 152 #define BITSLICED_ADD(output, lhs, rhs) do      \
 153     {                                           \
 154         output[0] = lhs[0] ^ rhs[0];            \
 155         output[1] = lhs[1] ^ rhs[1];            \
 156         output[2] = lhs[2] ^ rhs[2];            \
 157         output[3] = lhs[3] ^ rhs[3];            \
 158         output[4] = lhs[4] ^ rhs[4];            \
 159         output[5] = lhs[5] ^ rhs[5];            \
 160         output[6] = lhs[6] ^ rhs[6];            \
 161         output[7] = lhs[7] ^ rhs[7];            \
 162     } while (0)
 163
 164 /* -----
 165  * The AES S-box, in pure bitwise logic so that it can be run in
 166  * parallel on whole words full of bit-sliced field elements.
 167  *
 168  * Source: 'A new combinational logic minimization technique with
 169  * applications to cryptology', https://eprint.iacr.org/2009/191
 170  *
 171  * As a minor speed optimisation, I use a modified version of the
 172  * S-box which omits the additive constant 0x63, i.e. this S-box
 173  * consists of only the field inversion and linear map components.
 174  * Instead, the addition of the constant is deferred until after the
 175  * subsequent ShiftRows and MixColumns stages, so that it happens at
 176  * the same time as adding the next round key - and then we just make
 177  * it _part_ of the round key, so it doesn't cost any extra
 178  * instructions to add.
 179  *
 180  * (Obviously adding a constant to each byte commutes with ShiftRows,
 181  * which only permutes the bytes. It also commutes with MixColumns:
 182  * that's not quite so obvious, but since the effect of MixColumns is
 183  * to multiply a constant polynomial M into each column, it is obvious
 184  * that adding some polynomial K and then multiplying by M is
 185  * equivalent to multiplying by M and then adding the product KM. And
 186  * in fact, since the coefficients of M happen to sum to 1, it turns
 187  * out that KM = K, so we don't even have to change the constant when
 188  * we move it to the far side of MixColumns.)
 189  *
 190  * Of course, one knock-on effect of this is that the use of the S-box
 191  * *during* key setup has to be corrected by manually adding on the
 192  * constant afterwards!
 193  */
 194
 195 /* Initial linear transformation for the forward S-box, from Fig 2 of
 196  * the paper.
      *
      * Expands to bare declarations (there is no enclosing block) of
      * y1..y21, the temporaries t0 and t1, and i0. All of them are XOR
      * combinations of input[0..7], and SBOX_CORE reads them by name, so
      * the expansion must share a scope with SBOX_CORE. */
 197 #define SBOX_FORWARD_TOP_TRANSFORM(input, uintN_t)      \
 198         uintN_t y14 = input[4] ^ input[2];              \
 199         uintN_t y13 = input[7] ^ input[1];              \
 200         uintN_t y9 = input[7] ^ input[4];               \
 201         uintN_t y8 = input[7] ^ input[2];               \
 202         uintN_t t0 = input[6] ^ input[5];               \
 203         uintN_t y1 = t0 ^ input[0];                     \
 204         uintN_t y4 = y1 ^ input[4];                     \
 205         uintN_t y12 = y13 ^ y14;                        \
 206         uintN_t y2 = y1 ^ input[7];                     \
 207         uintN_t y5 = y1 ^ input[1];                     \
 208         uintN_t y3 = y5 ^ y8;                           \
 209         uintN_t t1 = input[3] ^ y12;                    \
 210         uintN_t y15 = t1 ^ input[2];                    \
 211         uintN_t y20 = t1 ^ input[6];                    \
 212         uintN_t y6 = y15 ^ input[0];                    \
 213         uintN_t y10 = y15 ^ t0;                         \
 214         uintN_t y11 = y20 ^ y9;                         \
 215         uintN_t y7 = input[0] ^ y11;                    \
 216         uintN_t y17 = y10 ^ y11;                        \
 217         uintN_t y19 = y10 ^ y8;                         \
 218         uintN_t y16 = t0 ^ y11;                         \
 219         uintN_t y21 = y13 ^ y16;                        \
 220         uintN_t y18 = input[7] ^ y16;                   \
 221         /* Make a copy of input[0] under a new name, because the core
 222          * will refer to it, and in the inverse version of the S-box
 223          * the corresponding value will be one of the calculated ones
 224          * and not in input[0] itself. */               \
 225         uintN_t i0 = input[0];                          \
 226         /* end */
 227
 228 /* Core nonlinear component, from Fig 3 of the paper.
      *
      * Reads y1..y21 and i0, which one of the *_TOP_TRANSFORM macros must
      * already have declared in the same scope, and declares t2..t45 and
      * z0..z17. The z values are the results picked up by the
      * *_BOTTOM_TRANSFORM macros. Every step is a single AND or XOR. */
 229 #define SBOX_CORE(uintN_t)                              \
 230         uintN_t t2 = y12 & y15;                         \
 231         uintN_t t3 = y3 & y6;                           \
 232         uintN_t t4 = t3 ^ t2;                           \
 233         uintN_t t5 = y4 & i0;                           \
 234         uintN_t t6 = t5 ^ t2;                           \
 235         uintN_t t7 = y13 & y16;                         \
 236         uintN_t t8 = y5 & y1;                           \
 237         uintN_t t9 = t8 ^ t7;                           \
 238         uintN_t t10 = y2 & y7;                          \
 239         uintN_t t11 = t10 ^ t7;                         \
 240         uintN_t t12 = y9 & y11;                         \
 241         uintN_t t13 = y14 & y17;                        \
 242         uintN_t t14 = t13 ^ t12;                        \
 243         uintN_t t15 = y8 & y10;                         \
 244         uintN_t t16 = t15 ^ t12;                        \
 245         uintN_t t17 = t4 ^ t14;                         \
 246         uintN_t t18 = t6 ^ t16;                         \
 247         uintN_t t19 = t9 ^ t14;                         \
 248         uintN_t t20 = t11 ^ t16;                        \
 249         uintN_t t21 = t17 ^ y20;                        \
 250         uintN_t t22 = t18 ^ y19;                        \
 251         uintN_t t23 = t19 ^ y21;                        \
 252         uintN_t t24 = t20 ^ y18;                        \
 253         uintN_t t25 = t21 ^ t22;                        \
 254         uintN_t t26 = t21 & t23;                        \
 255         uintN_t t27 = t24 ^ t26;                        \
 256         uintN_t t28 = t25 & t27;                        \
 257         uintN_t t29 = t28 ^ t22;                        \
 258         uintN_t t30 = t23 ^ t24;                        \
 259         uintN_t t31 = t22 ^ t26;                        \
 260         uintN_t t32 = t31 & t30;                        \
 261         uintN_t t33 = t32 ^ t24;                        \
 262         uintN_t t34 = t23 ^ t33;                        \
 263         uintN_t t35 = t27 ^ t33;                        \
 264         uintN_t t36 = t24 & t35;                        \
 265         uintN_t t37 = t36 ^ t34;                        \
 266         uintN_t t38 = t27 ^ t36;                        \
 267         uintN_t t39 = t29 & t38;                        \
 268         uintN_t t40 = t25 ^ t39;                        \
 269         uintN_t t41 = t40 ^ t37;                        \
 270         uintN_t t42 = t29 ^ t33;                        \
 271         uintN_t t43 = t29 ^ t40;                        \
 272         uintN_t t44 = t33 ^ t37;                        \
 273         uintN_t t45 = t42 ^ t41;                        \
 274         uintN_t z0 = t44 & y15;                         \
 275         uintN_t z1 = t37 & y6;                          \
 276         uintN_t z2 = t33 & i0;                          \
 277         uintN_t z3 = t43 & y16;                         \
 278         uintN_t z4 = t40 & y1;                          \
 279         uintN_t z5 = t29 & y7;                          \
 280         uintN_t z6 = t42 & y11;                         \
 281         uintN_t z7 = t45 & y17;                         \
 282         uintN_t z8 = t41 & y10;                         \
 283         uintN_t z9 = t44 & y12;                         \
 284         uintN_t z10 = t37 & y3;                         \
 285         uintN_t z11 = t33 & y4;                         \
 286         uintN_t z12 = t43 & y13;                        \
 287         uintN_t z13 = t40 & y5;                         \
 288         uintN_t z14 = t29 & y2;                         \
 289         uintN_t z15 = t42 & y9;                         \
 290         uintN_t z16 = t45 & y14;                        \
 291         uintN_t z17 = t41 & y8;                         \
 292         /* end */
 293
 294 /* Final linear transformation for the forward S-box, from Fig 4 of
 295  * the paper.
      *
      * Reads z0..z17 as declared by SBOX_CORE in the same scope, declares
      * the temporaries t46..t67, and assigns all of output[0..7]. Note that
      * output[4] is read back when output[6] is computed. */
 296 #define SBOX_FORWARD_BOTTOM_TRANSFORM(output, uintN_t)   \
 297         uintN_t t46 = z15 ^ z16;                        \
 298         uintN_t t47 = z10 ^ z11;                        \
 299         uintN_t t48 = z5 ^ z13;                         \
 300         uintN_t t49 = z9 ^ z10;                         \
 301         uintN_t t50 = z2 ^ z12;                         \
 302         uintN_t t51 = z2 ^ z5;                          \
 303         uintN_t t52 = z7 ^ z8;                          \
 304         uintN_t t53 = z0 ^ z3;                          \
 305         uintN_t t54 = z6 ^ z7;                          \
 306         uintN_t t55 = z16 ^ z17;                        \
 307         uintN_t t56 = z12 ^ t48;                        \
 308         uintN_t t57 = t50 ^ t53;                        \
 309         uintN_t t58 = z4 ^ t46;                         \
 310         uintN_t t59 = z3 ^ t54;                         \
 311         uintN_t t60 = t46 ^ t57;                        \
 312         uintN_t t61 = z14 ^ t57;                        \
 313         uintN_t t62 = t52 ^ t58;                        \
 314         uintN_t t63 = t49 ^ t58;                        \
 315         uintN_t t64 = z4 ^ t59;                         \
 316         uintN_t t65 = t61 ^ t62;                        \
 317         uintN_t t66 = z1 ^ t63;                         \
 318         output[7] = t59 ^ t63;                          \
 319         output[1] = t56 ^ t62;                          \
 320         output[0] = t48 ^ t60;                          \
 321         uintN_t t67 = t64 ^ t65;                        \
 322         output[4] = t53 ^ t66;                          \
 323         output[3] = t51 ^ t66;                          \
 324         output[2] = t47 ^ t65;                          \
 325         output[6] = t64 ^ output[4];                    \
 326         output[5] = t55 ^ t67;                          \
 327         /* end */
 328
 329 #define BITSLICED_SUBBYTES(output, input, uintN_t) do { \
             /* Forward S-box over all eight slices. The three pieces
              * communicate through the local variables they declare,
              * which the do-block keeps private to this expansion.
              * input[] is read only by the top transform, before any
              * element of output[] is assigned. */           \
 330         SBOX_FORWARD_TOP_TRANSFORM(input, uintN_t);      \
 331         SBOX_CORE(uintN_t);                             \
 332         SBOX_FORWARD_BOTTOM_TRANSFORM(output, uintN_t);  \
 333     } while (0)
 334
 335 /*
 336  * Initial and final linear transformations for the backward S-box. I
 337  * generated these myself, by implementing the linear-transform
 338  * optimisation algorithm in the paper, and applying it to the
 339  * matrices calculated by _their_ top and bottom transformations, pre-
 340  * and post-multiplied as appropriate by the linear map in the inverse
 341  * S-box.
 342  */
 343 #define SBOX_BACKWARD_TOP_TRANSFORM(input, uintN_t)     \
         /* Declares y1..y21 and i0 for SBOX_CORE, plus the temporaries
          * itmp8 and itmp9, as XOR combinations of input[0..7]. Unlike
          * the forward version, i0 here is a computed value rather
          * than a copy of input[0]. Expands to bare declarations, so
          * it must share a scope with SBOX_CORE. */         \
 344     uintN_t y5 = input[4] ^ input[6];                   \
 345     uintN_t y19 = input[3] ^ input[0];                  \
 346     uintN_t itmp8 = y5 ^ input[0];                      \
 347     uintN_t y4 = itmp8 ^ input[1];                      \
 348     uintN_t y9 = input[4] ^ input[3];                   \
 349     uintN_t y2 = y9 ^ y4;                               \
 350     uintN_t itmp9 = y2 ^ input[7];                      \
 351     uintN_t y1 = y9 ^ input[0];                         \
 352     uintN_t y6 = y5 ^ input[7];                         \
 353     uintN_t y18 = y9 ^ input[5];                        \
 354     uintN_t y7 = y18 ^ y2;                              \
 355     uintN_t y16 = y7 ^ y1;                              \
 356     uintN_t y21 = y7 ^ input[1];                        \
 357     uintN_t y3 = input[4] ^ input[7];                   \
 358     uintN_t y13 = y16 ^ y21;                            \
 359     uintN_t y8 = input[4] ^ y6;                         \
 360     uintN_t y10 = y8 ^ y19;                             \
 361     uintN_t y14 = y8 ^ y9;                              \
 362     uintN_t y20 = itmp9 ^ input[2];                     \
 363     uintN_t y11 = y9 ^ y20;                             \
 364     uintN_t i0 = y11 ^ y7;                              \
 365     uintN_t y15 = i0 ^ y6;                              \
 366     uintN_t y17 = y16 ^ y15;                            \
 367     uintN_t y12 = itmp9 ^ input[3];                     \
 368     /* end */
 369 #define SBOX_BACKWARD_BOTTOM_TRANSFORM(output, uintN_t) \
         /* Reads z0..z17 as declared by SBOX_CORE in the same scope,
          * declares the temporaries otmp18..otmp44, and assigns all of
          * output[0..7]. Note that output[1] is read back when otmp38
          * is computed. */                                  \
 370     uintN_t otmp18 = z15 ^ z6;                          \
 371     uintN_t otmp19 = z13 ^ otmp18;                      \
 372     uintN_t otmp20 = z12 ^ otmp19;                      \
 373     uintN_t otmp21 = z16 ^ otmp20;                      \
 374     uintN_t otmp22 = z8 ^ otmp21;                       \
 375     uintN_t otmp23 = z0 ^ otmp22;                       \
 376     uintN_t otmp24 = otmp22 ^ z3;                       \
 377     uintN_t otmp25 = otmp24 ^ z4;                       \
 378     uintN_t otmp26 = otmp25 ^ z2;                       \
 379     uintN_t otmp27 = z1 ^ otmp26;                       \
 380     uintN_t otmp28 = z14 ^ otmp27;                      \
 381     uintN_t otmp29 = otmp28 ^ z10;                      \
 382     output[4] = z2 ^ otmp23;                            \
 383     output[7] = z5 ^ otmp24;                            \
 384     uintN_t otmp30 = z11 ^ otmp29;                      \
 385     output[5] = z13 ^ otmp30;                           \
 386     uintN_t otmp31 = otmp25 ^ z8;                       \
 387     output[1] = z7 ^ otmp31;                            \
 388     uintN_t otmp32 = z11 ^ z9;                          \
 389     uintN_t otmp33 = z17 ^ otmp32;                      \
 390     uintN_t otmp34 = otmp30 ^ otmp33;                   \
 391     output[0] = z15 ^ otmp33;                           \
 392     uintN_t otmp35 = z12 ^ otmp34;                      \
 393     output[6] = otmp35 ^ z16;                           \
 394     uintN_t otmp36 = z1 ^ otmp23;                       \
 395     uintN_t otmp37 = z5 ^ otmp36;                       \
 396     output[2] = z4 ^ otmp37;                            \
 397     uintN_t otmp38 = z11 ^ output[1];                   \
 398     uintN_t otmp39 = z2 ^ otmp38;                       \
 399     uintN_t otmp40 = z17 ^ otmp39;                      \
 400     uintN_t otmp41 = z0 ^ otmp40;                       \
 401     uintN_t otmp42 = z5 ^ otmp41;                       \
 402     uintN_t otmp43 = otmp42 ^ z10;                      \
 403     uintN_t otmp44 = otmp43 ^ z3;                       \
 404     output[3] = otmp44 ^ z16;                           \
 405     /* end */
 406
 407 #define BITSLICED_INVSUBBYTES(output, input, uintN_t) do {      \
             /* Backward S-box over all eight slices: the same core as
              * the forward direction, between the backward top and
              * bottom transforms. input[] is read only by the top
              * transform, before any element of output[] is
              * assigned. */                                         \
 408         SBOX_BACKWARD_TOP_TRANSFORM(input, uintN_t);             \
 409         SBOX_CORE(uintN_t);                                     \
 410         SBOX_BACKWARD_BOTTOM_TRANSFORM(output, uintN_t);         \
 411     } while (0)
 412
 413
 414 /* -----
 415  * The ShiftRows transformation. This operates independently on each
 416  * bit slice.
 417  */
 418
 419 #define SINGLE_BITSLICE_SHIFTROWS(output, input, uintN_t) do            \
 420     {                                                                   \
             /* Works on one slice word. input is copied into x first and
              * output is assigned last, so both may name the same
              * lvalue. In each mask, ((uintN_t)~(uintN_t)0) / 0xFFFF is
              * the word with a 1 at the bottom of every 16-bit group;
              * multiplying by it repeats the 16-bit pattern across the
              * whole width of uintN_t. */                                   \
 421         uintN_t mask, mask2, mask3, diff, x = (input);                  \
 422         /* Rotate rows 2 and 3 by 16 bits */                            \
 423         mask = 0x00CC * (((uintN_t)~(uintN_t)0) / 0xFFFF);              \
 424         diff = ((x >> 8) ^ x) & mask;                                   \
 425         x ^= diff ^ (diff << 8);                                        \
 426         /* Rotate rows 1 and 3 by 8 bits */                             \
 427         mask  = 0x0AAA * (((uintN_t)~(uintN_t)0) / 0xFFFF);             \
 428         mask2 = 0xA000 * (((uintN_t)~(uintN_t)0) / 0xFFFF);             \
 429         mask3 = 0x5555 * (((uintN_t)~(uintN_t)0) / 0xFFFF);             \
             /* Bits under mask3 stay where they are; the others move
              * down by 4, with the lowest ones wrapping up by 12. */        \
 430         x = ((x >> 4) & mask) | ((x << 12) & mask2) | (x & mask3);      \
 431         /* Write output */                                              \
 432         (output) = x;                                                   \
 433     } while (0)
 434
 435 #define SINGLE_BITSLICE_INVSHIFTROWS(output, input, uintN_t) do         \
 436     {                                                                   \
             /* Works on one slice word. input is copied into x first and
              * output is assigned last, so both may name the same
              * lvalue. The masks are 16-bit patterns repeated across
              * uintN_t by the ((uintN_t)~(uintN_t)0) / 0xFFFF factor. */    \
 437         uintN_t mask, mask2, mask3, diff, x = (input);                  \
 438         /* Rotate rows 2 and 3 by 16 bits */                            \
 439         mask = 0x00CC * (((uintN_t)~(uintN_t)0) / 0xFFFF);              \
 440         diff = ((x >> 8) ^ x) & mask;                                   \
 441         x ^= diff ^ (diff << 8);                                        \
 442         /* Rotate rows 1 and 3 by 8 bits, the opposite way to ShiftRows */ \
 443         mask  = 0x000A * (((uintN_t)~(uintN_t)0) / 0xFFFF);             \
 444         mask2 = 0xAAA0 * (((uintN_t)~(uintN_t)0) / 0xFFFF);             \
 445         mask3 = 0x5555 * (((uintN_t)~(uintN_t)0) / 0xFFFF);             \
             /* Bits under mask3 stay where they are; the others move up
              * by 4, with the highest ones wrapping down by 12. */          \
 446         x = ((x >> 12) & mask) | ((x << 4) & mask2) | (x & mask3);      \
 447         /* Write output */                                              \
 448         (output) = x;                                                   \
 449     } while (0)
 450
 451 #define BITSLICED_SHIFTROWS(output, input, uintN_t) do                  \
 452     {                                                                   \
             /* Apply SINGLE_BITSLICE_SHIFTROWS to each of the 8 slices. */  \
 453         ITERATE(SINGLE_BITSLICE_SHIFTROWS, output, input, uintN_t);     \
 454     } while (0)
 455
 456 #define BITSLICED_INVSHIFTROWS(output, input, uintN_t) do               \
 457     {                                                                   \
             /* Apply SINGLE_BITSLICE_INVSHIFTROWS to each of the 8
              * slices. */                                                   \
 458         ITERATE(SINGLE_BITSLICE_INVSHIFTROWS, output, input, uintN_t);  \
 459     } while (0)
 460
 461 /* -----
 462  * The MixColumns transformation. This has to operate on all eight bit
 463  * slices at once, and also passes data back and forth between the
 464  * bits in an adjacent group of 4 within each slice.
 465  *
 466  * Notation: let F = GF(2)[X]/<X^8+X^4+X^3+X+1> be the finite field
 467  * used in AES, and let R = F[Y]/<Y^4+1> be the ring whose elements
 468  * represent the possible contents of a column of the matrix. I use X
 469  * and Y below in those senses, i.e. X is the value in F that
 470  * represents the byte 0x02, and Y is the value in R that cycles the
 471  * four bytes around by one if you multiply by it.
 472  */
 473
 474 /* Multiply every column by Y^3, i.e. cycle it round one place to the
 475  * right. Operates on one bit slice at a time; you have to wrap it in
 476  * ITERATE to affect all the data at once. */
 477 #define BITSLICED_MUL_BY_Y3(output, input, uintN_t) do          \
 478     {                                                           \
 479         uintN_t mask, mask2, x;                                 \
             /* ((uintN_t)~(uintN_t)0) / 0xF has a 1 at the bottom of
              * every 4-bit group, so these are the nibble patterns
              * 0x8 and 0x7 repeated across the whole word. */       \
 480         mask  = 0x8 * (((uintN_t)~(uintN_t)0) / 0xF);           \
 481         mask2 = 0x7 * (((uintN_t)~(uintN_t)0) / 0xF);           \
 482         x = input;                                              \
             /* In each 4-bit group, bit 0 moves up to bit 3 and bits
              * 3..1 move down to bits 2..0. */                      \
 483         output = ((x << 3) & mask) ^ ((x >> 1) & mask2);        \
 484     } while (0)
 485
 486 /* Multiply every column by Y^2. Operates on one bit slice at a time. */
 487 #define BITSLICED_MUL_BY_Y2(output, input, uintN_t) do          \
 488     {                                                           \
 489         uintN_t mask, mask2, x;                                 \
             /* Nibble patterns 0xC and 0x3 repeated across the word. */ \
 490         mask  = 0xC * (((uintN_t)~(uintN_t)0) / 0xF);           \
 491         mask2 = 0x3 * (((uintN_t)~(uintN_t)0) / 0xF);           \
 492         x = input;                                              \
             /* In each 4-bit group, swap the low pair of bits with
              * the high pair. */                                    \
 493         output = ((x << 2) & mask) ^ ((x >> 2) & mask2);        \
 494     } while (0)
 495
 496 #define BITSLICED_MUL_BY_1_Y3(output, input, uintN_t) do        \
 497     {                                                           \
             /* Multiply every column by 1+Y^3: XOR the slice with
              * its Y^3 multiple. The initial value given to tmp is
              * replaced by BITSLICED_MUL_BY_Y3 before it is used.
              * output is assigned only after input has been read,
              * so both may name the same lvalue. */                 \
 498         uintN_t tmp = input;                                    \
 499         BITSLICED_MUL_BY_Y3(tmp, input, uintN_t);               \
 500         output = input ^ tmp;                                   \
 501     } while (0)
 502
 503 /* Multiply every column by 1+Y^2. Operates on one bit slice at a time. */
 504 #define BITSLICED_MUL_BY_1_Y2(output, input, uintN_t) do        \
 505     {                                                           \
             /* The initial value given to tmp is replaced by
              * BITSLICED_MUL_BY_Y2 before it is used. output is
              * assigned only after input has been read, so both may
              * name the same lvalue. */                             \
 506         uintN_t tmp = input;                                    \
 507         BITSLICED_MUL_BY_Y2(tmp, input, uintN_t);               \
 508         output = input ^ tmp;                                   \
 509     } while (0)
 510
 511 /* Multiply every field element by X. This has to feed data between
 512  * slices, so it does the whole job in one go without needing ITERATE. */
 513 #define BITSLICED_MUL_BY_X(output, input, uintN_t) do   \
 514     {                                                   \
             /* Every slice moves up one index. The old top slice
              * (the X^7 coefficients) becomes the new slice 0 and
              * is also XORed into slices 1, 3 and 4, which is the
              * reduction by X^8 = X^4+X^3+X+1. */           \
 515         uintN_t bit7 = input[7];                        \
 516         output[7] = input[6];                           \
 517         output[6] = input[5];                           \
 518         output[5] = input[4];                           \
 519         output[4] = input[3] ^ bit7;                    \
 520         output[3] = input[2] ^ bit7;                    \
 521         output[2] = input[1];                           \
 522         output[1] = input[0] ^ bit7;                    \
 523         output[0] =            bit7;                    \
 524     } while (0)
 525
 526 /*
 527  * The MixColumns constant is
 528  *   M = X + Y + Y^2 + (X+1)Y^3
 529  * which we construct by rearranging it into
 530  *   M = 1 + (1+Y^3) [ X + (1+Y^2) ]
      *
      * input is read into the temporaries before output is assigned
      * element by element in the last step, so output may be the same
      * array as input (the round functions below call it that way).
 531  */
 532 #define BITSLICED_MIXCOLUMNS(output, input, uintN_t) do         \
 533     {                                                           \
 534         uintN_t a[8], aX[8], b[8];                              \
 535         /* a = input * (1+Y^3) */                               \
 536         ITERATE(BITSLICED_MUL_BY_1_Y3, a, input, uintN_t);      \
 537         /* aX = a * X */                                        \
 538         BITSLICED_MUL_BY_X(aX, a, uintN_t);                     \
 539         /* b = a * (1+Y^2) = input * (1+Y+Y^2+Y^3) */           \
 540         ITERATE(BITSLICED_MUL_BY_1_Y2, b, a, uintN_t);          \
 541         /* output = input + aX + b (reusing a as a temp) */     \
 542         BITSLICED_ADD(a, aX, b);                                \
 543         BITSLICED_ADD(output, input, a);                        \
 544     } while (0)
 545
 546 /*
 547  * The InvMixColumns constant, written out longhand, is
 548  *   I = (X^3+X^2+X) + (X^3+1)Y + (X^3+X^2+1)Y^2 + (X^3+X+1)Y^3
 549  * We represent this as
 550  *   I = (X^3+X^2+X+1)(Y^3+Y^2+Y+1) + 1 + X(Y+Y^2) + X^2(Y+Y^3)
      *
      * All intermediate values live in the local arrays X, X2, X3, S, A
      * and B; output is assigned only in the final BITSLICED_ADD.
 551  */
 552 #define BITSLICED_INVMIXCOLUMNS(output, input, uintN_t) do      \
 553     {                                                           \
 554         /* We need input * X^i for i=1,...,3 */                 \
 555         uintN_t X[8], X2[8], X3[8];                             \
 556         BITSLICED_MUL_BY_X(X, input, uintN_t);                  \
 557         BITSLICED_MUL_BY_X(X2, X, uintN_t);                     \
 558         BITSLICED_MUL_BY_X(X3, X2, uintN_t);                    \
 559         /* Sum them all and multiply by 1+Y+Y^2+Y^3. */         \
 560         uintN_t S[8];                                           \
 561         BITSLICED_ADD(S, input, X);                             \
 562         BITSLICED_ADD(S, S, X2);                                \
 563         BITSLICED_ADD(S, S, X3);                                \
             /* (1+Y^3)(1+Y^2) = 1+Y+Y^2+Y^3, because Y^4 = 1. */    \
 564         ITERATE(BITSLICED_MUL_BY_1_Y3, S, S, uintN_t);          \
 565         ITERATE(BITSLICED_MUL_BY_1_Y2, S, S, uintN_t);          \
 566         /* Compute the X(Y+Y^2) term. */                        \
             /* (1+Y^3) * Y^2 = Y^2+Y^5 = Y^2+Y, because Y^4 = 1. */ \
 567         uintN_t A[8];                                           \
 568         ITERATE(BITSLICED_MUL_BY_1_Y3, A, X, uintN_t);          \
 569         ITERATE(BITSLICED_MUL_BY_Y2, A, A, uintN_t);            \
 570         /* Compute the X^2(Y+Y^3) term. */                      \
             /* (1+Y^2) * Y^3 = Y^3+Y^5 = Y^3+Y, because Y^4 = 1. */ \
 571         uintN_t B[8];                                           \
 572         ITERATE(BITSLICED_MUL_BY_1_Y2, B, X2, uintN_t);         \
 573         ITERATE(BITSLICED_MUL_BY_Y3, B, B, uintN_t);            \
 574         /* And add all the pieces together. */                  \
 575         BITSLICED_ADD(S, S, input);                             \
 576         BITSLICED_ADD(S, S, A);                                 \
 577         BITSLICED_ADD(output, S, B);                            \
 578     } while (0)
 579
 580 /* -----
 581  * Put it all together into a cipher round.
 582  */
 583
/* Dummy macro that expands to an empty statement, passed in place of
 * a real (Inv)MixColumns macro to build the round functions that
 * leave that step out: the final encryption round and the first
 * decryption round. All three arguments are ignored. */
#define NO_MIXCOLUMNS(out, in, uintN_t) do {} while (0)
 586
/*
 * Template for one encryption round on bit-sliced state. Expands to a
 * static function aes_sliced_round_e_<suffix>(output, input, roundkey)
 * which applies, in this order: SubBytes, ShiftRows, mixcol_macro,
 * and the addition of the round key.
 *
 *   suffix        appended to the generated function's name
 *   uintN_t       integer type of each of the 8 slice words
 *   mixcol_macro  macro invoked as mixcol_macro(out, in, uintN_t);
 *                 BITSLICED_MIXCOLUMNS for a normal round, or
 *                 NO_MIXCOLUMNS to leave the step out
 *
 * Only the first step reads 'input'; every later step works on
 * 'output' in place. The cipher functions in this file call these
 * with the same array as both output and input.
 */
#define ENCRYPT_ROUND_FN(suffix, uintN_t, mixcol_macro)                 \
    static void aes_sliced_round_e_##suffix(                            \
        uintN_t output[8], const uintN_t input[8], const uintN_t roundkey[8]) \
    {                                                                   \
        BITSLICED_SUBBYTES(output, input, uintN_t);                     \
        BITSLICED_SHIFTROWS(output, output, uintN_t);                   \
        mixcol_macro(output, output, uintN_t);                          \
        BITSLICED_ADD(output, output, roundkey);                        \
    }

/* Normal and final (MixColumns-free) rounds, for uint16_t slice words
 * and for BignumInt slice words. */
ENCRYPT_ROUND_FN(serial, uint16_t, BITSLICED_MIXCOLUMNS)
ENCRYPT_ROUND_FN(serial_last, uint16_t, NO_MIXCOLUMNS)
ENCRYPT_ROUND_FN(parallel, BignumInt, BITSLICED_MIXCOLUMNS)
ENCRYPT_ROUND_FN(parallel_last, BignumInt, NO_MIXCOLUMNS)
 601
/*
 * Template for one decryption round on bit-sliced state. Expands to a
 * static function aes_sliced_round_d_<suffix>(output, input, roundkey)
 * which applies, in this order: the addition of the round key,
 * mixcol_macro, InvSubBytes and InvShiftRows.
 *
 *   suffix        appended to the generated function's name
 *   uintN_t       integer type of each of the 8 slice words
 *   mixcol_macro  macro invoked as mixcol_macro(out, in, uintN_t);
 *                 BITSLICED_INVMIXCOLUMNS for a normal round, or
 *                 NO_MIXCOLUMNS to leave the step out
 *
 * Only the first step reads 'input'; every later step works on
 * 'output' in place.
 */
#define DECRYPT_ROUND_FN(suffix, uintN_t, mixcol_macro)                 \
    static void aes_sliced_round_d_##suffix(                            \
        uintN_t output[8], const uintN_t input[8], const uintN_t roundkey[8]) \
    {                                                                   \
        BITSLICED_ADD(output, input, roundkey);                         \
        mixcol_macro(output, output, uintN_t);                          \
        BITSLICED_INVSUBBYTES(output, output, uintN_t);                 \
        BITSLICED_INVSHIFTROWS(output, output, uintN_t);                \
    }

#if 0 /* no cipher mode we support requires serial decryption */
DECRYPT_ROUND_FN(serial, uint16_t, BITSLICED_INVMIXCOLUMNS)
DECRYPT_ROUND_FN(serial_first, uint16_t, NO_MIXCOLUMNS)
#endif
/* Normal and first (InvMixColumns-free) rounds for BignumInt slice
 * words. */
DECRYPT_ROUND_FN(parallel, BignumInt, BITSLICED_INVMIXCOLUMNS)
DECRYPT_ROUND_FN(parallel_first, BignumInt, NO_MIXCOLUMNS)
 618
 619 /* -----
 620  * Key setup function.
 621  */
 622
/*
 * Expanded AES key schedule in bit-sliced form, as filled in by
 * aes_sliced_key_setup(). Each round key occupies 8 consecutive
 * entries (one per bit position) in each of the two arrays.
 */
typedef struct aes_sliced_key aes_sliced_key;
struct aes_sliced_key {
    /* Round keys for the parallel cipher functions: each entry is the
     * matching roundkeys_serial[] value repeated in every 16-bit
     * field of a BignumInt. */
    BignumInt roundkeys_parallel[MAXROUNDKEYS * 8];
    /* Round keys for the serial (one block at a time) cipher
     * functions. */
    uint16_t roundkeys_serial[MAXROUNDKEYS * 8];
    /* Number of cipher rounds: the key length in 32-bit words, plus 6. */
    unsigned rounds;
};
 629
 630 static void aes_sliced_key_setup(
 631     aes_sliced_key *sk, const void *vkey, size_t keybits)
 632 {
 633     const unsigned char *key = (const unsigned char *)vkey;
 634
 635     size_t key_words = keybits / 32;
 636     sk->rounds = key_words + 6;
 637     size_t sched_words = (sk->rounds + 1) * 4;
 638
 639     unsigned rconpos = 0;
 640
 641     uint16_t *outslices = sk->roundkeys_serial;
 642     unsigned outshift = 0;
 643
 644     memset(sk->roundkeys_serial, 0, sizeof(sk->roundkeys_serial));
 645
 646     uint8_t inblk[16];
 647     memset(inblk, 0, 16);
 648     uint16_t slices[8];
 649
 650     for (size_t i = 0; i < sched_words; i++) {
 651         /*
 652          * Prepare a word of round key in the low 4 bits of each
 653          * integer in slices[].
 654          */
 655         if (i < key_words) {
 656             memcpy(inblk, key + 4*i, 4);
 657             TO_BITSLICES(slices, inblk, uint16_t, =, 0);
 658         } else {
 659             unsigned wordindex, bitshift;
 660             uint16_t *prevslices;
 661
 662             /* Fetch the (i-1)th key word */
 663             wordindex = i-1;
 664             bitshift = 4 * (wordindex & 3);
 665             prevslices = sk->roundkeys_serial + 8 * (wordindex >> 2);
 666             for (size_t i = 0; i < 8; i++)
 667                 slices[i] = prevslices[i] >> bitshift;
 668
 669             /* Decide what we're doing in this expansion stage */
 670             bool rotate_and_round_constant = (i % key_words == 0);
 671             bool sub = rotate_and_round_constant ||
 672                 (key_words == 8 && i % 8 == 4);
 673
 674             if (rotate_and_round_constant) {
 675                 for (size_t i = 0; i < 8; i++)
 676                     slices[i] = ((slices[i] << 3) | (slices[i] >> 1)) & 0xF;
 677             }
 678
 679             if (sub) {
 680                 /* Apply the SubBytes transform to the key word. But
 681                  * here we need to apply the _full_ SubBytes from the
 682                  * spec, including the constant which our S-box leaves
 683                  * out. */
 684                 BITSLICED_SUBBYTES(slices, slices, uint16_t);
 685                 slices[0] ^= 0xFFFF;
 686                 slices[1] ^= 0xFFFF;
 687                 slices[5] ^= 0xFFFF;
 688                 slices[6] ^= 0xFFFF;
 689             }
 690
 691             if (rotate_and_round_constant) {
 692                 assert(rconpos < lenof(aes_key_setup_round_constants));
 693                 uint8_t rcon = aes_key_setup_round_constants[rconpos++];
 694                 for (size_t i = 0; i < 8; i++)
 695                     slices[i] ^= 1 & (rcon >> i);
 696             }
 697
 698             /* Combine with the (i-Nk)th key word */
 699             wordindex = i - key_words;
 700             bitshift = 4 * (wordindex & 3);
 701             prevslices = sk->roundkeys_serial + 8 * (wordindex >> 2);
 702             for (size_t i = 0; i < 8; i++)
 703                 slices[i] ^= prevslices[i] >> bitshift;
 704         }
 705
 706         /*
 707          * Now copy it into sk.
 708          */
 709         for (unsigned b = 0; b < 8; b++)
 710             outslices[b] |= (slices[b] & 0xF) << outshift;
 711         outshift += 4;
 712         if (outshift == 16) {
 713             outshift = 0;
 714             outslices += 8;
 715         }
 716     }
 717
 718     smemclr(inblk, sizeof(inblk));
 719     smemclr(slices, sizeof(slices));
 720
 721     /*
 722      * Add the S-box constant to every round key after the first one,
 723      * compensating for it being left out in the main cipher.
 724      */
 725     for (size_t i = 8; i < 8 * (sched_words/4); i += 8) {
 726         sk->roundkeys_serial[i+0] ^= 0xFFFF;
 727         sk->roundkeys_serial[i+1] ^= 0xFFFF;
 728         sk->roundkeys_serial[i+5] ^= 0xFFFF;
 729         sk->roundkeys_serial[i+6] ^= 0xFFFF;
 730     }
 731
 732     /*
 733      * Replicate that set of round keys into larger integers for the
 734      * parallel versions of the cipher.
 735      */
 736     for (size_t i = 0; i < 8 * (sched_words / 4); i++) {
 737         sk->roundkeys_parallel[i] = sk->roundkeys_serial[i] *
 738             ((BignumInt)~(BignumInt)0 / 0xFFFF);
 739     }
 740 }
 741
 742 /* -----
 743  * The full cipher primitive, including transforming the input and
 744  * output to/from bit-sliced form.
 745  */
 746
/*
 * Template for a complete encryption function. Expands to
 * aes_sliced_e_<suffix>(output, input, sk), which encrypts 'nblocks'
 * consecutive 16-byte blocks read from 'input' and writes the same
 * number of blocks to 'output', using the round keys in
 * sk->roundkeys_<suffix>.
 *
 *   suffix   selects the round functions and round-key array
 *   uintN_t  integer type of each of the 8 slice words
 *   nblocks  number of blocks handled per call
 *
 * Block i is converted with i*16 as the shift argument to
 * TO_BITSLICES / FROM_BITSLICES. The cipher itself is: add the first
 * round key, run sk->rounds-1 ordinary rounds, then one final round
 * without MixColumns.
 */
#define ENCRYPT_FN(suffix, uintN_t, nblocks)                            \
    static void aes_sliced_e_##suffix(                                  \
        uint8_t *output, const uint8_t *input, const aes_sliced_key *sk) \
    {                                                                   \
        uintN_t state[8];                                               \
        TO_BITSLICES(state, input, uintN_t, =, 0);                      \
        for (unsigned i = 1; i < nblocks; i++) {                        \
            input += 16;                                                \
            TO_BITSLICES(state, input, uintN_t, |=, i*16);              \
        }                                                               \
        const uintN_t *keys = sk->roundkeys_##suffix;                   \
        BITSLICED_ADD(state, state, keys);                              \
        keys += 8;                                                      \
        for (unsigned i = 0; i < sk->rounds-1; i++) {                   \
            aes_sliced_round_e_##suffix(state, state, keys);            \
            keys += 8;                                                  \
        }                                                               \
        aes_sliced_round_e_##suffix##_last(state, state, keys);         \
        for (unsigned i = 0; i < nblocks; i++) {                        \
            FROM_BITSLICES(output, state, i*16);                        \
            output += 16;                                               \
        }                                                               \
    }
 770
/*
 * Template for a complete decryption function. Expands to
 * aes_sliced_d_<suffix>(output, input, sk), which decrypts 'nblocks'
 * consecutive 16-byte blocks read from 'input' and writes the same
 * number of blocks to 'output', using the round keys in
 * sk->roundkeys_<suffix>.
 *
 *   suffix   selects the round functions and round-key array
 *   uintN_t  integer type of each of the 8 slice words
 *   nblocks  number of blocks handled per call
 *
 * The round keys are walked backwards, starting from the one at
 * offset 8*sk->rounds: one first round without InvMixColumns, then
 * sk->rounds-1 ordinary rounds, and finally the addition of the
 * round key at the start of the array.
 */
#define DECRYPT_FN(suffix, uintN_t, nblocks)                            \
    static void aes_sliced_d_##suffix(                                  \
        uint8_t *output, const uint8_t *input, const aes_sliced_key *sk) \
    {                                                                   \
        uintN_t state[8];                                               \
        TO_BITSLICES(state, input, uintN_t, =, 0);                      \
        for (unsigned i = 1; i < nblocks; i++) {                        \
            input += 16;                                                \
            TO_BITSLICES(state, input, uintN_t, |=, i*16);              \
        }                                                               \
        const uintN_t *keys = sk->roundkeys_##suffix + 8*sk->rounds;    \
        aes_sliced_round_d_##suffix##_first(state, state, keys);        \
        keys -= 8;                                                      \
        for (unsigned i = 0; i < sk->rounds-1; i++) {                   \
            aes_sliced_round_d_##suffix(state, state, keys);            \
            keys -= 8;                                                  \
        }                                                               \
        BITSLICED_ADD(state, state, keys);                              \
        for (unsigned i = 0; i < nblocks; i++) {                        \
            FROM_BITSLICES(output, state, i*16);                        \
            output += 16;                                               \
        }                                                               \
    }
 794
/* aes_sliced_e_serial: encrypts a single block, uint16_t slice words. */
ENCRYPT_FN(serial, uint16_t, 1)
#if 0 /* no cipher mode we support requires serial decryption */
DECRYPT_FN(serial, uint16_t, 1)
#endif
/* aes_sliced_e_parallel / aes_sliced_d_parallel: process
 * SLICE_PARALLELISM blocks per call, BignumInt slice words. */
ENCRYPT_FN(parallel, BignumInt, SLICE_PARALLELISM)
DECRYPT_FN(parallel, BignumInt, SLICE_PARALLELISM)
 801
 802 /* -----
 803  * The SSH interface and the cipher modes.
 804  */
 805
/* Number of BignumInt words making up the 16-byte SDCTR counter. */
#define SDCTR_WORDS (16 / BIGNUM_INT_BYTES)
 807
/*
 * Per-cipher-instance state for the software AES implementation: the
 * expanded key, the IV / counter state for whichever mode is in use,
 * and the embedded ssh_cipher through which callers refer to it.
 */
typedef struct aes_sw_context aes_sw_context;
struct aes_sw_context {
    /* Expanded key, set up by aes_sw_setkey(). */
    aes_sliced_key sk;
    /* Mode-specific state. The members share storage, so only the one
     * belonging to the mode in use is meaningful. */
    union {
        struct {
            /* In CBC mode, the IV is just a copy of the last seen
             * cipher block. */
            uint8_t prevblk[16];
        } cbc;
        struct {
            /* In SDCTR mode, we keep the counter itself in a form
             * that's easy to increment. We also use the parallel
             * version of the core AES function, so we'll encrypt
             * multiple counter values in one go. That won't align
             * nicely with the sizes of data we're asked to encrypt,
             * so we must also store a cache of the last set of
             * keystream blocks we generated, and our current position
             * within that cache. */
            BignumInt counter[SDCTR_WORDS];
            uint8_t keystream[SLICE_PARALLELISM * 16];
            /* Points at the next unused keystream block; equal to the
             * end of keystream[] when the cache is empty. */
            uint8_t *keystream_pos;
        } sdctr;
        struct {
            /* In GCM mode, the cipher preimage consists of three
             * sections: one fixed, one that increments per message
             * sent and MACed, and one that increments per cipher
             * block. */
            uint64_t msg_counter;
            uint32_t fixed_iv, block_counter;
            /* But we keep the precomputed keystream chunks just like
             * SDCTR mode. */
            uint8_t keystream[SLICE_PARALLELISM * 16];
            /* Same convention as sdctr.keystream_pos. */
            uint8_t *keystream_pos;
        } gcm;
    } iv;
    /* Handle given out to callers; the functions below get back to
     * the enclosing context from it with container_of(). */
    ssh_cipher ciph;
};
 845
 846 static ssh_cipher *aes_sw_new(const ssh_cipheralg *alg)
 847 {
 848     aes_sw_context *ctx = snew(aes_sw_context);
 849     ctx->ciph.vt = alg;
 850     return &ctx->ciph;
 851 }
 852
 853 static void aes_sw_free(ssh_cipher *ciph)
 854 {
 855     aes_sw_context *ctx = container_of(ciph, aes_sw_context, ciph);
 856     smemclr(ctx, sizeof(*ctx));
 857     sfree(ctx);
 858 }
 859
 860 static void aes_sw_setkey(ssh_cipher *ciph, const void *vkey)
 861 {
 862     aes_sw_context *ctx = container_of(ciph, aes_sw_context, ciph);
 863     aes_sliced_key_setup(&ctx->sk, vkey, ctx->ciph.vt->real_keybits);
 864 }
 865
 866 static void aes_sw_setiv_cbc(ssh_cipher *ciph, const void *iv)
 867 {
 868     aes_sw_context *ctx = container_of(ciph, aes_sw_context, ciph);
 869     memcpy(ctx->iv.cbc.prevblk, iv, 16);
 870 }
 871
 872 static void aes_sw_setiv_sdctr(ssh_cipher *ciph, const void *viv)
 873 {
 874     aes_sw_context *ctx = container_of(ciph, aes_sw_context, ciph);
 875     const uint8_t *iv = (const uint8_t *)viv;
 876
 877     /* Import the initial counter value into the internal representation */
 878     for (unsigned i = 0; i < SDCTR_WORDS; i++)
 879         ctx->iv.sdctr.counter[i] =
 880             GET_BIGNUMINT_MSB_FIRST(
 881                 iv + 16 - BIGNUM_INT_BYTES - i*BIGNUM_INT_BYTES);
 882
 883     /* Set keystream_pos to indicate that the keystream cache is
 884      * currently empty */
 885     ctx->iv.sdctr.keystream_pos =
 886         ctx->iv.sdctr.keystream + sizeof(ctx->iv.sdctr.keystream);
 887 }
 888
 889 static void aes_sw_setiv_gcm(ssh_cipher *ciph, const void *viv)
 890 {
 891     aes_sw_context *ctx = container_of(ciph, aes_sw_context, ciph);
 892     const uint8_t *iv = (const uint8_t *)viv;
 893
 894     ctx->iv.gcm.fixed_iv = GET_32BIT_MSB_FIRST(iv);
 895     ctx->iv.gcm.msg_counter = GET_64BIT_MSB_FIRST(iv + 4);
 896     ctx->iv.gcm.block_counter = 1;
 897
 898     /* Set keystream_pos to indicate that the keystream cache is
 899      * currently empty */
 900     ctx->iv.gcm.keystream_pos =
 901         ctx->iv.gcm.keystream + sizeof(ctx->iv.gcm.keystream);
 902 }
 903
 904 static void aes_sw_next_message_gcm(ssh_cipher *ciph)
 905 {
 906     aes_sw_context *ctx = container_of(ciph, aes_sw_context, ciph);
 907
 908     ctx->iv.gcm.msg_counter++;
 909     ctx->iv.gcm.block_counter = 1;
 910     ctx->iv.gcm.keystream_pos =
 911         ctx->iv.gcm.keystream + sizeof(ctx->iv.gcm.keystream);
 912 }
 913
/* Pointer to a function operating on a block held as four 32-bit
 * words, given a key schedule of 32-bit words.
 * NOTE(review): nothing from here to the end of the file refers to
 * this type; it may be a leftover - check the earlier part of the
 * file before removing it. */
typedef void (*aes_sw_fn)(uint32_t v[4], const uint32_t *keysched);
 915
 916 static inline void memxor16(void *vout, const void *vlhs, const void *vrhs)
 917 {
 918     uint8_t *out = (uint8_t *)vout;
 919     const uint8_t *lhs = (const uint8_t *)vlhs, *rhs = (const uint8_t *)vrhs;
 920     uint64_t w;
 921
 922     w = GET_64BIT_LSB_FIRST(lhs);
 923     w ^= GET_64BIT_LSB_FIRST(rhs);
 924     PUT_64BIT_LSB_FIRST(out, w);
 925     w = GET_64BIT_LSB_FIRST(lhs + 8);
 926     w ^= GET_64BIT_LSB_FIRST(rhs + 8);
 927     PUT_64BIT_LSB_FIRST(out + 8, w);
 928 }
 929
 930 static inline void aes_cbc_sw_encrypt(
 931     ssh_cipher *ciph, void *vblk, int blklen)
 932 {
 933     aes_sw_context *ctx = container_of(ciph, aes_sw_context, ciph);
 934
 935     /*
 936      * CBC encryption has to be done serially, because the input to
 937      * each run of the cipher includes the output from the previous
 938      * run.
 939      */
 940
 941     for (uint8_t *blk = (uint8_t *)vblk, *finish = blk + blklen;
 942          blk < finish; blk += 16) {
 943         /*
 944          * We use the IV array itself as the location for the
 945          * encryption, because there's no reason not to.
 946          */
 947
 948         /* XOR the new plaintext block into the previous cipher block */
 949         memxor16(ctx->iv.cbc.prevblk, ctx->iv.cbc.prevblk, blk);
 950
 951         /* Run the cipher over the result, which leaves it
 952          * conveniently already stored in ctx->iv */
 953         aes_sliced_e_serial(
 954             ctx->iv.cbc.prevblk, ctx->iv.cbc.prevblk, &ctx->sk);
 955
 956         /* Copy it to the output location */
 957         memcpy(blk, ctx->iv.cbc.prevblk, 16);
 958     }
 959 }
 960
 961 static inline void aes_cbc_sw_decrypt(
 962     ssh_cipher *ciph, void *vblk, int blklen)
 963 {
 964     aes_sw_context *ctx = container_of(ciph, aes_sw_context, ciph);
 965     uint8_t *blk = (uint8_t *)vblk;
 966
 967     /*
 968      * CBC decryption can run in parallel, because all the
 969      * _ciphertext_ blocks are already available.
 970      */
 971
 972     size_t blocks_remaining = blklen / 16;
 973
 974     uint8_t data[SLICE_PARALLELISM * 16];
 975     /* Zeroing the data array is probably overcautious, but it avoids
 976      * technically undefined behaviour from leaving it uninitialised
 977      * if our very first iteration doesn't include enough cipher
 978      * blocks to populate it fully */
 979     memset(data, 0, sizeof(data));
 980
 981     while (blocks_remaining > 0) {
 982         /* Number of blocks we'll handle in this iteration. If we're
 983          * dealing with fewer than the maximum, it doesn't matter -
 984          * it's harmless to run the full parallel cipher function
 985          * anyway. */
 986         size_t blocks = (blocks_remaining < SLICE_PARALLELISM ?
 987                          blocks_remaining : SLICE_PARALLELISM);
 988
 989         /* Parallel-decrypt the input, in a separate array so we still
 990          * have the cipher stream available for XORing. */
 991         memcpy(data, blk, 16 * blocks);
 992         aes_sliced_d_parallel(data, data, &ctx->sk);
 993
 994         /* Write the output and update the IV */
 995         for (size_t i = 0; i < blocks; i++) {
 996             uint8_t *decrypted = data + 16*i;
 997             uint8_t *output = blk + 16*i;
 998
 999             memxor16(decrypted, decrypted, ctx->iv.cbc.prevblk);
1000             memcpy(ctx->iv.cbc.prevblk, output, 16);
1001             memcpy(output, decrypted, 16);
1002         }
1003
1004         /* Advance the input pointer. */
1005         blk += 16 * blocks;
1006         blocks_remaining -= blocks;
1007     }
1008
1009     smemclr(data, sizeof(data));
1010 }
1011
1012 static inline void aes_sdctr_sw(
1013     ssh_cipher *ciph, void *vblk, int blklen)
1014 {
1015     aes_sw_context *ctx = container_of(ciph, aes_sw_context, ciph);
1016
1017     /*
1018      * SDCTR encrypt/decrypt loops round one block at a time XORing
1019      * the keystream into the user's data, and periodically has to run
1020      * a parallel encryption operation to get more keystream.
1021      */
1022
1023     uint8_t *keystream_end =
1024         ctx->iv.sdctr.keystream + sizeof(ctx->iv.sdctr.keystream);
1025
1026     for (uint8_t *blk = (uint8_t *)vblk, *finish = blk + blklen;
1027          blk < finish; blk += 16) {
1028
1029         if (ctx->iv.sdctr.keystream_pos == keystream_end) {
1030             /*
1031              * Generate some keystream.
1032              */
1033             for (uint8_t *block = ctx->iv.sdctr.keystream;
1034                  block < keystream_end; block += 16) {
1035                 /* Format the counter value into the buffer. */
1036                 for (unsigned i = 0; i < SDCTR_WORDS; i++)
1037                     PUT_BIGNUMINT_MSB_FIRST(
1038                         block + 16 - BIGNUM_INT_BYTES - i*BIGNUM_INT_BYTES,
1039                         ctx->iv.sdctr.counter[i]);
1040
1041                 /* Increment the counter. */
1042                 BignumCarry carry = 1;
1043                 for (unsigned i = 0; i < SDCTR_WORDS; i++)
1044                     BignumADC(ctx->iv.sdctr.counter[i], carry,
1045                               ctx->iv.sdctr.counter[i], 0, carry);
1046             }
1047
1048             /* Encrypt all those counter blocks. */
1049             aes_sliced_e_parallel(ctx->iv.sdctr.keystream,
1050                                   ctx->iv.sdctr.keystream, &ctx->sk);
1051
1052             /* Reset keystream_pos to the start of the buffer. */
1053             ctx->iv.sdctr.keystream_pos = ctx->iv.sdctr.keystream;
1054         }
1055
1056         memxor16(blk, blk, ctx->iv.sdctr.keystream_pos);
1057         ctx->iv.sdctr.keystream_pos += 16;
1058     }
1059 }
1060
1061 static inline void aes_encrypt_ecb_block_sw(ssh_cipher *ciph, void *blk)
1062 {
1063     aes_sw_context *ctx = container_of(ciph, aes_sw_context, ciph);
1064     aes_sliced_e_serial(blk, blk, &ctx->sk);
1065 }
1066
1067 static inline void aes_gcm_sw(
1068     ssh_cipher *ciph, void *vblk, int blklen)
1069 {
1070     aes_sw_context *ctx = container_of(ciph, aes_sw_context, ciph);
1071
1072     /*
1073      * GCM encrypt/decrypt looks just like SDCTR, except that the
1074      * method of generating more keystream varies slightly.
1075      */
1076
1077     uint8_t *keystream_end =
1078         ctx->iv.gcm.keystream + sizeof(ctx->iv.gcm.keystream);
1079
1080     for (uint8_t *blk = (uint8_t *)vblk, *finish = blk + blklen;
1081          blk < finish; blk += 16) {
1082
1083         if (ctx->iv.gcm.keystream_pos == keystream_end) {
1084             /*
1085              * Generate some keystream.
1086              */
1087             for (uint8_t *block = ctx->iv.gcm.keystream;
1088                  block < keystream_end; block += 16) {
1089                 /* Format the counter value into the buffer. */
1090                 PUT_32BIT_MSB_FIRST(block, ctx->iv.gcm.fixed_iv);
1091                 PUT_64BIT_MSB_FIRST(block + 4, ctx->iv.gcm.msg_counter);
1092                 PUT_32BIT_MSB_FIRST(block + 12, ctx->iv.gcm.block_counter);
1093
1094                 /* Increment the counter. */
1095                 ctx->iv.gcm.block_counter++;
1096             }
1097
1098             /* Encrypt all those counter blocks. */
1099             aes_sliced_e_parallel(ctx->iv.gcm.keystream,
1100                                   ctx->iv.gcm.keystream, &ctx->sk);
1101
1102             /* Reset keystream_pos to the start of the buffer. */
1103             ctx->iv.gcm.keystream_pos = ctx->iv.gcm.keystream;
1104         }
1105
1106         memxor16(blk, blk, ctx->iv.gcm.keystream_pos);
1107         ctx->iv.gcm.keystream_pos += 16;
1108     }
1109 }
1110
/*
 * For a key length 'len', define the five entry points
 * aes<len>_sw_cbc_encrypt, aes<len>_sw_cbc_decrypt, aes<len>_sw_sdctr,
 * aes<len>_sw_gcm and aes<len>_sw_encrypt_ecb_block. Each one simply
 * forwards its arguments to the corresponding shared (key-length
 * independent) function above.
 * NOTE(review): these are presumably the names that AES_ALL_VTABLES
 * expects to find - confirm against its definition.
 */
#define SW_ENC_DEC(len)                                 \
    static void aes##len##_sw_cbc_encrypt(              \
        ssh_cipher *ciph, void *vblk, int blklen)       \
    { aes_cbc_sw_encrypt(ciph, vblk, blklen); }         \
    static void aes##len##_sw_cbc_decrypt(              \
        ssh_cipher *ciph, void *vblk, int blklen)       \
    { aes_cbc_sw_decrypt(ciph, vblk, blklen); }         \
    static void aes##len##_sw_sdctr(                    \
        ssh_cipher *ciph, void *vblk, int blklen)       \
    { aes_sdctr_sw(ciph, vblk, blklen); }               \
    static void aes##len##_sw_gcm(                      \
        ssh_cipher *ciph, void *vblk, int blklen)       \
    { aes_gcm_sw(ciph, vblk, blklen); }                 \
    static void aes##len##_sw_encrypt_ecb_block(        \
        ssh_cipher *ciph, void *vblk)                   \
    { aes_encrypt_ecb_block_sw(ciph, vblk); }

/* One set of entry points per supported key length. */
SW_ENC_DEC(128)
SW_ENC_DEC(192)
SW_ENC_DEC(256)
1131
/* NOTE(review): AES_EXTRA and AES_ALL_VTABLES are not defined in this
 * file (presumably they come from aes.h). They appear to declare the
 * cipher vtables for this implementation from the _sw-suffixed
 * functions above, with "unaccelerated" as its descriptive label -
 * confirm against their definitions. */
AES_EXTRA(_sw);
AES_ALL_VTABLES(_sw, "unaccelerated");