drivers/staging/skein/skein_block.c

   1 /***********************************************************************
   2 **
   3 ** Implementation of the Skein block functions.
   4 **
   5 ** Source code author: Doug Whiting, 2008.
   6 **
   7 ** This algorithm and source code is released to the public domain.
   8 **
   9 ** Compile-time switches:
  10 **
  11 **  SKEIN_USE_ASM  -- set bits (256/512/1024) to select which
  12 **                    versions use ASM code for block processing
  13 **                    [default: use C for all block sizes]
  14 **
  15 ************************************************************************/
  16
  17 #include <linux/string.h>
  18 #include <skein.h>
  19
  20 #ifndef SKEIN_USE_ASM
  21 #define SKEIN_USE_ASM   (0)                     /* default is all C code (no ASM) */
  22 #endif
  23
     /*
      * SKEIN_LOOP is decoded below as three decimal digits, one per block size:
      * hundreds digit -> SKEIN_UNROLL_256, tens digit -> SKEIN_UNROLL_512,
      * ones digit -> SKEIN_UNROLL_1024.  A digit of 0 selects the fully unrolled
      * code; a non-zero digit selects the looping code with that many 8-round
      * groups per loop iteration.
      *
      * Note: the default is spelled 001, which C parses as an octal literal.
      * That is harmless here (octal 001 == 1), but an override such as 010
      * would be read as octal (value 8), so write overrides without a leading
      * zero.
      */
  24 #ifndef SKEIN_LOOP
  25 #define SKEIN_LOOP 001                          /* default: unroll 256 and 512, but not 1024 */
  26 #endif
  27
     /*
      * The macros below are only meaningful inside the block functions, each of
      * which declares its own WCNT enum constant and a local u64 array kw[].
      * ts (tweak words) starts at kw[0] and ks (key schedule words) starts at
      * kw[3], so ts[i + 3] and ks[i] name the same storage.
      */
  28 #define BLK_BITS        (WCNT*64)               /* some useful definitions for code here */
  29 #define KW_TWK_BASE     (0)
  30 #define KW_KEY_BASE     (3)
  31 #define ks              (kw + KW_KEY_BASE)
  32 #define ts              (kw + KW_TWK_BASE)
  33
     /*
      * With SKEIN_DEBUG defined, DebugSaveTweak() copies the working tweak words
      * ts[0..1] back into ctx->h.T; otherwise it expands to nothing.
      */
  34 #ifdef SKEIN_DEBUG
  35 #define DebugSaveTweak(ctx) { ctx->h.T[0] = ts[0]; ctx->h.T[1] = ts[1]; }
  36 #else
  37 #define DebugSaveTweak(ctx)
  38 #endif
  39
  40 /*****************************  Skein_256 ******************************/
  41 #if !(SKEIN_USE_ASM & 256)
  42 void Skein_256_Process_Block(struct skein_256_ctx *ctx, const u8 *blkPtr, size_t blkCnt, size_t byteCntAdd)
  43         { /* do it in C */
             /*
              * Compress blkCnt consecutive SKEIN_256_BLOCK_BYTES-sized input
              * blocks, starting at blkPtr, into the chaining state held in ctx.
              *
              * ctx:        chaining words ctx->X[0..3] are read before and
              *             rewritten after every block; tweak words
              *             ctx->h.T[0..1] are loaded once on entry and stored
              *             back once after the last block.
              * blkPtr:     input bytes, loaded as WCNT little-endian 64-bit
              *             words per block.
              * blkCnt:     number of blocks; must be non-zero because the
              *             do/while body runs before the count is tested.
              * byteCntAdd: added to tweak word 0 before each block is processed.
              */
  44         enum {
  45                 WCNT = SKEIN_256_STATE_WORDS
  46         };
             /* RCNT: number of 8-round groups (one R256_8_rounds() expansion each) */
  47 #undef  RCNT
  48 #define RCNT  (SKEIN_256_ROUNDS_TOTAL/8)
  49
             /* hundreds digit of SKEIN_LOOP: 0 = fully unrolled, else groups per loop pass */
  50 #ifdef SKEIN_LOOP                              /* configure how much to unroll the loop */
  51 #define SKEIN_UNROLL_256 (((SKEIN_LOOP)/100)%10)
  52 #else
  53 #define SKEIN_UNROLL_256 (0)
  54 #endif
  55
  56 #if SKEIN_UNROLL_256
  57 #if (RCNT % SKEIN_UNROLL_256)
  58 #error "Invalid SKEIN_UNROLL_256"               /* sanity check on unroll count */
  59 #endif
  60         size_t  r;
  61         u64  kw[WCNT+4+RCNT*2];                  /* key schedule words : chaining vars + tweak + "rotation"*/
  62 #else
  63         u64  kw[WCNT+4];                         /* key schedule words : chaining vars + tweak */
  64 #endif
  65         u64  X0, X1, X2, X3;                        /* local copy of context vars, for speed */
  66         u64  w[WCNT];                           /* local copy of input block */
  67 #ifdef SKEIN_DEBUG
  68         const u64 *Xptr[4];                      /* use for debugging (help compiler put Xn in registers) */
  69         Xptr[0] = &X0;  Xptr[1] = &X1;  Xptr[2] = &X2;  Xptr[3] = &X3;
  70 #endif
  71         Skein_assert(blkCnt != 0);                  /* never call with blkCnt == 0! */
  72         ts[0] = ctx->h.T[0];
  73         ts[1] = ctx->h.T[1];
  74         do  {
  75                 /* this implementation only supports 2**64 input bytes (no carry out here) */
  76                 ts[0] += byteCntAdd;                    /* update processed length */
  77
  78                 /* precompute the key schedule for this block */
  79                 ks[0] = ctx->X[0];
  80                 ks[1] = ctx->X[1];
  81                 ks[2] = ctx->X[2];
  82                 ks[3] = ctx->X[3];
                     /* extra schedule word: xor of all chaining words and SKEIN_KS_PARITY */
  83                 ks[4] = ks[0] ^ ks[1] ^ ks[2] ^ ks[3] ^ SKEIN_KS_PARITY;
  84
                     /* third tweak word is the xor of the first two */
  85                 ts[2] = ts[0] ^ ts[1];
  86
  87                 Skein_Get64_LSB_First(w, blkPtr, WCNT);   /* get input block in little-endian format */
  88                 DebugSaveTweak(ctx);
  89                 Skein_Show_Block(BLK_BITS, &ctx->h, ctx->X, blkPtr, w, ks, ts);
  90
  91                 X0 = w[0] + ks[0];                      /* do the first full key injection */
  92                 X1 = w[1] + ks[1] + ts[0];
  93                 X2 = w[2] + ks[2] + ts[1];
  94                 X3 = w[3] + ks[3];
  95
  96                 Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INITIAL, Xptr);    /* show starting state values */
  97
                     /* w[] already holds this block, so advance to the next one now */
  98                 blkPtr += SKEIN_256_BLOCK_BYTES;
  99
 100                 /* run the rounds */
 101
                     /*
                      * Round256: one round, i.e. an add / rotate-left / xor step on
                      * the word pairs (p0,p1) and (p2,p3).  rNum is not used by this
                      * macro itself; the R256 wrappers pass it to Skein_Show_R_Ptr.
                      */
 102 #define Round256(p0, p1, p2, p3, ROT, rNum)                              \
 103         X##p0 += X##p1; X##p1 = RotL_64(X##p1, ROT##_0); X##p1 ^= X##p0; \
 104         X##p2 += X##p3; X##p3 = RotL_64(X##p3, ROT##_1); X##p3 ^= X##p2; \
 105
 106 #if SKEIN_UNROLL_256 == 0
 107 #define R256(p0, p1, p2, p3, ROT, rNum)           /* fully unrolled */   \
 108         Round256(p0, p1, p2, p3, ROT, rNum)                                  \
 109         Skein_Show_R_Ptr(BLK_BITS, &ctx->h, rNum, Xptr);
 110
                     /* unrolled key injection: R is a constant, so schedule indices wrap with % */
 111 #define I256(R)                                                     \
 112         X0   += ks[((R)+1) % 5];    /* inject the key schedule value */ \
 113         X1   += ks[((R)+2) % 5] + ts[((R)+1) % 3];                      \
 114         X2   += ks[((R)+3) % 5] + ts[((R)+2) % 3];                      \
 115         X3   += ks[((R)+4) % 5] +     (R)+1;                            \
 116         Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INJECT, Xptr);
 117 #else                                       /* looping version */
 118 #define R256(p0, p1, p2, p3, ROT, rNum)                                  \
 119         Round256(p0, p1, p2, p3, ROT, rNum)                                  \
 120         Skein_Show_R_Ptr(BLK_BITS, &ctx->h, 4 * (r - 1) + rNum, Xptr);
 121
                     /*
                      * Looping key injection: instead of wrapping indices, each call
                      * appends a copy of the oldest schedule/tweak word at the end of
                      * the window.  Statement order matters: ts and ks share kw[]
                      * (ts[i + 3] is ks[i]), so ts[r + R + 2] is the same storage as
                      * ks[r + R - 1].  That ks word must be copied to ks[r + R + 4]
                      * before the ts store overwrites it.
                      */
 122 #define I256(R)                                                     \
 123         X0   += ks[r+(R)+0];        /* inject the key schedule value */ \
 124         X1   += ks[r+(R)+1] + ts[r+(R)+0];                              \
 125         X2   += ks[r+(R)+2] + ts[r+(R)+1];                              \
 126         X3   += ks[r+(R)+3] +    r+(R);                              \
 127         ks[r + (R) + 4]   = ks[r + (R) - 1];     /* rotate key schedule */\
 128         ts[r + (R) + 2]   = ts[r + (R) - 1];                              \
 129         Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INJECT, Xptr);
 130
 131         for (r = 1; r < 2 * RCNT; r += 2 * SKEIN_UNROLL_256)  /* loop thru it */
 132 #endif
                     /* body of the for loop when looping; a plain compound statement when unrolled */
 133                 {
 134 #define R256_8_rounds(R)                  \
 135                 R256(0, 1, 2, 3, R_256_0, 8 * (R) + 1);  \
 136                 R256(0, 3, 2, 1, R_256_1, 8 * (R) + 2);  \
 137                 R256(0, 1, 2, 3, R_256_2, 8 * (R) + 3);  \
 138                 R256(0, 3, 2, 1, R_256_3, 8 * (R) + 4);  \
 139                 I256(2 * (R));                      \
 140                 R256(0, 1, 2, 3, R_256_4, 8 * (R) + 5);  \
 141                 R256(0, 3, 2, 1, R_256_5, 8 * (R) + 6);  \
 142                 R256(0, 1, 2, 3, R_256_6, 8 * (R) + 7);  \
 143                 R256(0, 3, 2, 1, R_256_7, 8 * (R) + 8);  \
 144                 I256(2 * (R) + 1);
 145
 146                 R256_8_rounds(0);
 147
                     /*
                      * R256_Unroll_R(NN) is non-zero when 8-round group NN must be
                      * emitted: either fully unrolled with more than NN groups in
                      * total, or looping with an unroll count greater than NN.
                      */
 148 #define R256_Unroll_R(NN) ((SKEIN_UNROLL_256 == 0 && SKEIN_256_ROUNDS_TOTAL/8 > (NN)) || (SKEIN_UNROLL_256 > (NN)))
 149
 150         #if   R256_Unroll_R(1)
 151                 R256_8_rounds(1);
 152         #endif
 153         #if   R256_Unroll_R(2)
 154                 R256_8_rounds(2);
 155         #endif
 156         #if   R256_Unroll_R(3)
 157                 R256_8_rounds(3);
 158         #endif
 159         #if   R256_Unroll_R(4)
 160                 R256_8_rounds(4);
 161         #endif
 162         #if   R256_Unroll_R(5)
 163                 R256_8_rounds(5);
 164         #endif
 165         #if   R256_Unroll_R(6)
 166                 R256_8_rounds(6);
 167         #endif
 168         #if   R256_Unroll_R(7)
 169                 R256_8_rounds(7);
 170         #endif
 171         #if   R256_Unroll_R(8)
 172                 R256_8_rounds(8);
 173         #endif
 174         #if   R256_Unroll_R(9)
 175                 R256_8_rounds(9);
 176         #endif
 177         #if   R256_Unroll_R(10)
 178                 R256_8_rounds(10);
 179         #endif
 180         #if   R256_Unroll_R(11)
 181                 R256_8_rounds(11);
 182         #endif
 183         #if   R256_Unroll_R(12)
 184                 R256_8_rounds(12);
 185         #endif
 186         #if   R256_Unroll_R(13)
 187                 R256_8_rounds(13);
 188         #endif
 189         #if   R256_Unroll_R(14)
 190                 R256_8_rounds(14);
 191         #endif
 192         #if  (SKEIN_UNROLL_256 > 14)
 193 #error  "need more unrolling in Skein_256_Process_Block"
 194         #endif
 195                 }
 196                 /* do the final "feedforward" xor, update context chaining vars */
 197                 ctx->X[0] = X0 ^ w[0];
 198                 ctx->X[1] = X1 ^ w[1];
 199                 ctx->X[2] = X2 ^ w[2];
 200                 ctx->X[3] = X3 ^ w[3];
 201
 202                 Skein_Show_Round(BLK_BITS, &ctx->h, SKEIN_RND_FEED_FWD, ctx->X);
 203
                     /* clear SKEIN_T1_FLAG_FIRST in the working tweak before any further block */
 204                 ts[1] &= ~SKEIN_T1_FLAG_FIRST;
 205         }
 206         while (--blkCnt);
             /* store the working tweak back into the context */
 207         ctx->h.T[0] = ts[0];
 208         ctx->h.T[1] = ts[1];
 209 }
 210
 211 #if defined(SKEIN_CODE_SIZE) || defined(SKEIN_PERF)
 212 size_t Skein_256_Process_Block_CodeSize(void)
 213 {
             /*
              * Byte distance from the entry of Skein_256_Process_Block to the
              * entry of this function.
              * NOTE(review): this equals the block function's code size only if
              * the toolchain places the two functions back to back in definition
              * order -- confirm for the build in use.
              */
 214         u8 *block_fn_entry = (u8 *) Skein_256_Process_Block;
 215         u8 *this_fn_entry = (u8 *) Skein_256_Process_Block_CodeSize;

             return this_fn_entry - block_fn_entry;
 216 }
 217 unsigned int Skein_256_Unroll_Cnt(void)
 218 {
             /* compile-time unroll setting (SKEIN_UNROLL_256) used to build the 256-bit block function */
 219         const unsigned int unroll_setting = SKEIN_UNROLL_256;

             return unroll_setting;
 220 }
 221 #endif
 222 #endif
 223
 224 /*****************************  Skein_512 ******************************/
 225 #if !(SKEIN_USE_ASM & 512)
 226 void Skein_512_Process_Block(struct skein_512_ctx *ctx, const u8 *blkPtr, size_t blkCnt, size_t byteCntAdd)
 227 { /* do it in C */
             /*
              * Compress blkCnt consecutive SKEIN_512_BLOCK_BYTES-sized input
              * blocks, starting at blkPtr, into the chaining state held in ctx.
              *
              * ctx:        chaining words ctx->X[0..7] are read before and
              *             rewritten after every block; tweak words
              *             ctx->h.T[0..1] are loaded once on entry and stored
              *             back once after the last block.
              * blkPtr:     input bytes, loaded as WCNT little-endian 64-bit
              *             words per block.
              * blkCnt:     number of blocks; must be non-zero because the
              *             do/while body runs before the count is tested.
              * byteCntAdd: added to tweak word 0 before each block is processed.
              */
 228         enum {
 229                 WCNT = SKEIN_512_STATE_WORDS
 230         };
             /* RCNT: number of 8-round groups (one R512_8_rounds() expansion each) */
 231 #undef  RCNT
 232 #define RCNT  (SKEIN_512_ROUNDS_TOTAL/8)
 233
             /* tens digit of SKEIN_LOOP: 0 = fully unrolled, else groups per loop pass */
 234 #ifdef SKEIN_LOOP                              /* configure how much to unroll the loop */
 235 #define SKEIN_UNROLL_512 (((SKEIN_LOOP)/10)%10)
 236 #else
 237 #define SKEIN_UNROLL_512 (0)
 238 #endif
 239
 240 #if SKEIN_UNROLL_512
 241 #if (RCNT % SKEIN_UNROLL_512)
 242 #error "Invalid SKEIN_UNROLL_512"               /* sanity check on unroll count */
 243 #endif
 244         size_t  r;
 245         u64  kw[WCNT+4+RCNT*2];                  /* key schedule words : chaining vars + tweak + "rotation"*/
 246 #else
 247         u64  kw[WCNT+4];                         /* key schedule words : chaining vars + tweak */
 248 #endif
 249         u64  X0, X1, X2, X3, X4, X5, X6, X7;            /* local copy of vars,  for speed */
 250         u64  w[WCNT];                           /* local copy of input block */
 251 #ifdef SKEIN_DEBUG
 252         const u64 *Xptr[8];                      /* use for debugging (help compiler put Xn in registers) */
 253         Xptr[0] = &X0;  Xptr[1] = &X1;  Xptr[2] = &X2;  Xptr[3] = &X3;
 254         Xptr[4] = &X4;  Xptr[5] = &X5;  Xptr[6] = &X6;  Xptr[7] = &X7;
 255 #endif
 256
 257         Skein_assert(blkCnt != 0);                  /* never call with blkCnt == 0! */
 258         ts[0] = ctx->h.T[0];
 259         ts[1] = ctx->h.T[1];
 260         do  {
 261                 /* this implementation only supports 2**64 input bytes (no carry out here) */
 262                 ts[0] += byteCntAdd;                    /* update processed length */
 263
 264                 /* precompute the key schedule for this block */
 265                 ks[0] = ctx->X[0];
 266                 ks[1] = ctx->X[1];
 267                 ks[2] = ctx->X[2];
 268                 ks[3] = ctx->X[3];
 269                 ks[4] = ctx->X[4];
 270                 ks[5] = ctx->X[5];
 271                 ks[6] = ctx->X[6];
 272                 ks[7] = ctx->X[7];
                     /* extra schedule word: xor of all chaining words and SKEIN_KS_PARITY */
 273                 ks[8] = ks[0] ^ ks[1] ^ ks[2] ^ ks[3] ^
 274                         ks[4] ^ ks[5] ^ ks[6] ^ ks[7] ^ SKEIN_KS_PARITY;
 275
                     /* third tweak word is the xor of the first two */
 276                 ts[2] = ts[0] ^ ts[1];
 277
 278                 Skein_Get64_LSB_First(w, blkPtr, WCNT); /* get input block in little-endian format */
 279                 DebugSaveTweak(ctx);
 280                 Skein_Show_Block(BLK_BITS, &ctx->h, ctx->X, blkPtr, w, ks, ts);
 281
 282                 X0   = w[0] + ks[0];                    /* do the first full key injection */
 283                 X1   = w[1] + ks[1];
 284                 X2   = w[2] + ks[2];
 285                 X3   = w[3] + ks[3];
 286                 X4   = w[4] + ks[4];
 287                 X5   = w[5] + ks[5] + ts[0];
 288                 X6   = w[6] + ks[6] + ts[1];
 289                 X7   = w[7] + ks[7];
 290
                     /* w[] already holds this block, so advance to the next one now */
 291                 blkPtr += SKEIN_512_BLOCK_BYTES;
 292
 293                 Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INITIAL, Xptr);
 294                 /* run the rounds */
                     /*
                      * Round512: one round, i.e. an add / rotate-left / xor step on
                      * the four word pairs (p0,p1) .. (p6,p7).  rNum is not used by
                      * this macro itself; the R512 wrappers pass it to
                      * Skein_Show_R_Ptr.
                      */
 295 #define Round512(p0, p1, p2, p3, p4, p5, p6, p7, ROT, rNum)                  \
 296                 X##p0 += X##p1; X##p1 = RotL_64(X##p1, ROT##_0); X##p1 ^= X##p0; \
 297                 X##p2 += X##p3; X##p3 = RotL_64(X##p3, ROT##_1); X##p3 ^= X##p2; \
 298                 X##p4 += X##p5; X##p5 = RotL_64(X##p5, ROT##_2); X##p5 ^= X##p4; \
 299                 X##p6 += X##p7; X##p7 = RotL_64(X##p7, ROT##_3); X##p7 ^= X##p6; \
 300
 301 #if SKEIN_UNROLL_512 == 0
 302 #define R512(p0, p1, p2, p3, p4, p5, p6, p7, ROT, rNum)      /* unrolled */  \
 303                 Round512(p0, p1, p2, p3, p4, p5, p6, p7, ROT, rNum)                      \
 304                 Skein_Show_R_Ptr(BLK_BITS, &ctx->h, rNum, Xptr);
 305
                     /* unrolled key injection: R is a constant, so schedule indices wrap with % */
 306 #define I512(R)                                                     \
 307                 X0   += ks[((R) + 1) % 9];   /* inject the key schedule value */  \
 308                 X1   += ks[((R) + 2) % 9];                                        \
 309                 X2   += ks[((R) + 3) % 9];                                        \
 310                 X3   += ks[((R) + 4) % 9];                                        \
 311                 X4   += ks[((R) + 5) % 9];                                        \
 312                 X5   += ks[((R) + 6) % 9] + ts[((R) + 1) % 3];                      \
 313                 X6   += ks[((R) + 7) % 9] + ts[((R) + 2) % 3];                      \
 314                 X7   += ks[((R) + 8) % 9] +     (R) + 1;                            \
 315                 Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INJECT, Xptr);
 316 #else                                       /* looping version */
 317 #define R512(p0, p1, p2, p3, p4, p5, p6, p7, ROT, rNum)                      \
 318                 Round512(p0, p1, p2, p3, p4, p5, p6, p7, ROT, rNum)                      \
 319                 Skein_Show_R_Ptr(BLK_BITS, &ctx->h, 4 * (r - 1) + rNum, Xptr);
 320
                     /*
                      * Looping key injection: instead of wrapping indices, each call
                      * appends a copy of the oldest schedule/tweak word at the end of
                      * the window.  Statement order matters: ts and ks share kw[]
                      * (ts[i + 3] is ks[i]), so ts[r + R + 2] is the same storage as
                      * ks[r + R - 1].  That ks word must be copied to ks[r + R + 8]
                      * before the ts store overwrites it.
                      */
 321 #define I512(R)                                                     \
 322                 X0   += ks[r + (R) + 0];        /* inject the key schedule value */ \
 323                 X1   += ks[r + (R) + 1];                                            \
 324                 X2   += ks[r + (R) + 2];                                            \
 325                 X3   += ks[r + (R) + 3];                                            \
 326                 X4   += ks[r + (R) + 4];                                            \
 327                 X5   += ks[r + (R) + 5] + ts[r + (R) + 0];                              \
 328                 X6   += ks[r + (R) + 6] + ts[r + (R) + 1];                              \
 329                 X7   += ks[r + (R) + 7] +         r + (R);                              \
 330                 ks[r +         (R) + 8] = ks[r + (R) - 1];  /* rotate key schedule */   \
 331                 ts[r +         (R) + 2] = ts[r + (R) - 1];                              \
 332                 Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INJECT, Xptr);
 333
 334                 for (r = 1; r < 2 * RCNT; r += 2 * SKEIN_UNROLL_512)   /* loop thru it */
 335 #endif                         /* end of looped code definitions */
                     /* body of the for loop when looping; a plain compound statement when unrolled */
 336                 {
 337 #define R512_8_rounds(R)  /* do 8 full rounds */  \
 338                         R512(0, 1, 2, 3, 4, 5, 6, 7, R_512_0, 8 * (R) + 1);   \
 339                         R512(2, 1, 4, 7, 6, 5, 0, 3, R_512_1, 8 * (R) + 2);   \
 340                         R512(4, 1, 6, 3, 0, 5, 2, 7, R_512_2, 8 * (R) + 3);   \
 341                         R512(6, 1, 0, 7, 2, 5, 4, 3, R_512_3, 8 * (R) + 4);   \
 342                         I512(2 * (R));                              \
 343                         R512(0, 1, 2, 3, 4, 5, 6, 7, R_512_4, 8 * (R) + 5);   \
 344                         R512(2, 1, 4, 7, 6, 5, 0, 3, R_512_5, 8 * (R) + 6);   \
 345                         R512(4, 1, 6, 3, 0, 5, 2, 7, R_512_6, 8 * (R) + 7);   \
 346                         R512(6, 1, 0, 7, 2, 5, 4, 3, R_512_7, 8 * (R) + 8);   \
 347                         I512(2 * (R) + 1);        /* and key injection */
 348
 349                         R512_8_rounds(0);
 350
                     /*
                      * R512_Unroll_R(NN) is non-zero when 8-round group NN must be
                      * emitted: either fully unrolled with more than NN groups in
                      * total, or looping with an unroll count greater than NN.
                      */
 351 #define R512_Unroll_R(NN) ((SKEIN_UNROLL_512 == 0 && SKEIN_512_ROUNDS_TOTAL/8 > (NN)) || (SKEIN_UNROLL_512 > (NN)))
 352
 353         #if   R512_Unroll_R(1)
 354                         R512_8_rounds(1);
 355         #endif
 356         #if   R512_Unroll_R(2)
 357                         R512_8_rounds(2);
 358         #endif
 359         #if   R512_Unroll_R(3)
 360                         R512_8_rounds(3);
 361         #endif
 362         #if   R512_Unroll_R(4)
 363                         R512_8_rounds(4);
 364         #endif
 365         #if   R512_Unroll_R(5)
 366                         R512_8_rounds(5);
 367         #endif
 368         #if   R512_Unroll_R(6)
 369                         R512_8_rounds(6);
 370         #endif
 371         #if   R512_Unroll_R(7)
 372                         R512_8_rounds(7);
 373         #endif
 374         #if   R512_Unroll_R(8)
 375                         R512_8_rounds(8);
 376         #endif
 377         #if   R512_Unroll_R(9)
 378                         R512_8_rounds(9);
 379         #endif
 380         #if   R512_Unroll_R(10)
 381                         R512_8_rounds(10);
 382         #endif
 383         #if   R512_Unroll_R(11)
 384                         R512_8_rounds(11);
 385         #endif
 386         #if   R512_Unroll_R(12)
 387                         R512_8_rounds(12);
 388         #endif
 389         #if   R512_Unroll_R(13)
 390                         R512_8_rounds(13);
 391         #endif
 392         #if   R512_Unroll_R(14)
 393                         R512_8_rounds(14);
 394         #endif
 395         #if  (SKEIN_UNROLL_512 > 14)
 396 #error  "need more unrolling in Skein_512_Process_Block"
 397         #endif
 398                 }
 399
 400                 /* do the final "feedforward" xor, update context chaining vars */
 401                 ctx->X[0] = X0 ^ w[0];
 402                 ctx->X[1] = X1 ^ w[1];
 403                 ctx->X[2] = X2 ^ w[2];
 404                 ctx->X[3] = X3 ^ w[3];
 405                 ctx->X[4] = X4 ^ w[4];
 406                 ctx->X[5] = X5 ^ w[5];
 407                 ctx->X[6] = X6 ^ w[6];
 408                 ctx->X[7] = X7 ^ w[7];
 409                 Skein_Show_Round(BLK_BITS, &ctx->h, SKEIN_RND_FEED_FWD, ctx->X);
 410
                     /* clear SKEIN_T1_FLAG_FIRST in the working tweak before any further block */
 411                 ts[1] &= ~SKEIN_T1_FLAG_FIRST;
 412         }
 413         while (--blkCnt);
             /* store the working tweak back into the context */
 414         ctx->h.T[0] = ts[0];
 415         ctx->h.T[1] = ts[1];
 416 }
 417
 418 #if defined(SKEIN_CODE_SIZE) || defined(SKEIN_PERF)
 419 size_t Skein_512_Process_Block_CodeSize(void)
 420 {
             /*
              * Byte distance from the entry of Skein_512_Process_Block to the
              * entry of this function.
              * NOTE(review): this equals the block function's code size only if
              * the toolchain places the two functions back to back in definition
              * order -- confirm for the build in use.
              */
 421         u8 *block_fn_entry = (u8 *) Skein_512_Process_Block;
 422         u8 *this_fn_entry = (u8 *) Skein_512_Process_Block_CodeSize;

             return this_fn_entry - block_fn_entry;
 423 }
 424 unsigned int Skein_512_Unroll_Cnt(void)
 425 {
             /* compile-time unroll setting (SKEIN_UNROLL_512) used to build the 512-bit block function */
 426         const unsigned int unroll_setting = SKEIN_UNROLL_512;

             return unroll_setting;
 427 }
 428 #endif
 429 #endif
 430
 431 /*****************************  Skein1024 ******************************/
 432 #if !(SKEIN_USE_ASM & 1024)
 433 void Skein1024_Process_Block(struct skein1024_ctx *ctx, const u8 *blkPtr, size_t blkCnt, size_t byteCntAdd)
 434 { /* do it in C; loops by default (unrolled is bigger AND slower!) */
             /*
              * Compress blkCnt consecutive SKEIN1024_BLOCK_BYTES-sized input
              * blocks, starting at blkPtr, into the chaining state held in ctx.
              *
              * ctx:        chaining words ctx->X[0..15] are read before and
              *             rewritten after every block; tweak words
              *             ctx->h.T[0..1] are loaded once on entry and stored
              *             back once after the last block.
              * blkPtr:     input bytes, loaded as WCNT little-endian 64-bit
              *             words per block.
              * blkCnt:     number of blocks; must be non-zero because the
              *             do/while body runs before the count is tested.
              * byteCntAdd: added to tweak word 0 before each block is processed.
              *
              * The looping code is what the default SKEIN_LOOP selects; a ones
              * digit of 0 in SKEIN_LOOP builds the fully unrolled variant instead.
              */
 435         enum {
 436                 WCNT = SKEIN1024_STATE_WORDS
 437         };
             /* RCNT: number of 8-round groups (one R1024_8_rounds() expansion each) */
 438 #undef  RCNT
 439 #define RCNT  (SKEIN1024_ROUNDS_TOTAL/8)
 440
             /* ones digit of SKEIN_LOOP: 0 = fully unrolled, else groups per loop pass */
 441 #ifdef SKEIN_LOOP                              /* configure how much to unroll the loop */
 442 #define SKEIN_UNROLL_1024 ((SKEIN_LOOP)%10)
 443 #else
 444 #define SKEIN_UNROLL_1024 (0)
 445 #endif
 446
 447 #if (SKEIN_UNROLL_1024 != 0)
 448 #if (RCNT % SKEIN_UNROLL_1024)
 449 #error "Invalid SKEIN_UNROLL_1024"              /* sanity check on unroll count */
 450 #endif
 451         size_t  r;
 452         u64  kw[WCNT+4+RCNT*2];                  /* key schedule words : chaining vars + tweak + "rotation"*/
 453 #else
 454         u64  kw[WCNT+4];                         /* key schedule words : chaining vars + tweak */
 455 #endif
 456
 457         u64  X00, X01, X02, X03, X04, X05, X06, X07,     /* local copy of vars, for speed */
 458                 X08, X09, X10, X11, X12, X13, X14, X15;
 459         u64  w[WCNT];                            /* local copy of input block */
 460 #ifdef SKEIN_DEBUG
 461         const u64 *Xptr[16];                     /* use for debugging (help compiler put Xn in registers) */
 462         Xptr[0]  = &X00;  Xptr[1]  = &X01;  Xptr[2]  = &X02;  Xptr[3]  = &X03;
 463         Xptr[4]  = &X04;  Xptr[5]  = &X05;  Xptr[6]  = &X06;  Xptr[7]  = &X07;
 464         Xptr[8]  = &X08;  Xptr[9]  = &X09;  Xptr[10] = &X10;  Xptr[11] = &X11;
 465         Xptr[12] = &X12;  Xptr[13] = &X13;  Xptr[14] = &X14;  Xptr[15] = &X15;
 466 #endif
 467
 468         Skein_assert(blkCnt != 0);                  /* never call with blkCnt == 0! */
 469         ts[0] = ctx->h.T[0];
 470         ts[1] = ctx->h.T[1];
 471         do  {
 472                 /* this implementation only supports 2**64 input bytes (no carry out here) */
 473                 ts[0] += byteCntAdd;                    /* update processed length */
 474
 475                 /* precompute the key schedule for this block */
 476                 ks[0]  = ctx->X[0];
 477                 ks[1]  = ctx->X[1];
 478                 ks[2]  = ctx->X[2];
 479                 ks[3]  = ctx->X[3];
 480                 ks[4]  = ctx->X[4];
 481                 ks[5]  = ctx->X[5];
 482                 ks[6]  = ctx->X[6];
 483                 ks[7]  = ctx->X[7];
 484                 ks[8]  = ctx->X[8];
 485                 ks[9]  = ctx->X[9];
 486                 ks[10] = ctx->X[10];
 487                 ks[11] = ctx->X[11];
 488                 ks[12] = ctx->X[12];
 489                 ks[13] = ctx->X[13];
 490                 ks[14] = ctx->X[14];
 491                 ks[15] = ctx->X[15];
                     /* extra schedule word: xor of all chaining words and SKEIN_KS_PARITY */
 492                 ks[16] =  ks[0] ^  ks[1] ^  ks[2] ^  ks[3] ^
 493                           ks[4] ^  ks[5] ^  ks[6] ^  ks[7] ^
 494                           ks[8] ^  ks[9] ^ ks[10] ^ ks[11] ^
 495                           ks[12] ^ ks[13] ^ ks[14] ^ ks[15] ^ SKEIN_KS_PARITY;
 496
                     /* third tweak word is the xor of the first two */
 497                 ts[2]  = ts[0] ^ ts[1];
 498
 499                 Skein_Get64_LSB_First(w, blkPtr, WCNT); /* get input block in little-endian format */
 500                 DebugSaveTweak(ctx);
 501                 Skein_Show_Block(BLK_BITS, &ctx->h, ctx->X, blkPtr, w, ks, ts);
 502
 503                 X00    =  w[0] +  ks[0];                 /* do the first full key injection */
 504                 X01    =  w[1] +  ks[1];
 505                 X02    =  w[2] +  ks[2];
 506                 X03    =  w[3] +  ks[3];
 507                 X04    =  w[4] +  ks[4];
 508                 X05    =  w[5] +  ks[5];
 509                 X06    =  w[6] +  ks[6];
 510                 X07    =  w[7] +  ks[7];
 511                 X08    =  w[8] +  ks[8];
 512                 X09    =  w[9] +  ks[9];
 513                 X10    = w[10] + ks[10];
 514                 X11    = w[11] + ks[11];
 515                 X12    = w[12] + ks[12];
 516                 X13    = w[13] + ks[13] + ts[0];
 517                 X14    = w[14] + ks[14] + ts[1];
 518                 X15    = w[15] + ks[15];
 519
 520                 Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INITIAL, Xptr);
 521
                     /*
                      * Round1024: one round, i.e. an add / rotate-left / xor step on
                      * the eight word pairs (p0,p1) .. (pE,pF).  rNum is not used by
                      * this macro itself; the R1024 wrappers pass their round number
                      * to Skein_Show_R_Ptr.
                      */
 522 #define Round1024(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, pA, pB, pC, pD, pE, pF, ROT, rNum) \
 523                 X##p0 += X##p1; X##p1 = RotL_64(X##p1, ROT##_0); X##p1 ^= X##p0;   \
 524                 X##p2 += X##p3; X##p3 = RotL_64(X##p3, ROT##_1); X##p3 ^= X##p2;   \
 525                 X##p4 += X##p5; X##p5 = RotL_64(X##p5, ROT##_2); X##p5 ^= X##p4;   \
 526                 X##p6 += X##p7; X##p7 = RotL_64(X##p7, ROT##_3); X##p7 ^= X##p6;   \
 527                 X##p8 += X##p9; X##p9 = RotL_64(X##p9, ROT##_4); X##p9 ^= X##p8;   \
 528                 X##pA += X##pB; X##pB = RotL_64(X##pB, ROT##_5); X##pB ^= X##pA;   \
 529                 X##pC += X##pD; X##pD = RotL_64(X##pD, ROT##_6); X##pD ^= X##pC;   \
 530                 X##pE += X##pF; X##pF = RotL_64(X##pF, ROT##_7); X##pF ^= X##pE;   \
 531
 532 #if SKEIN_UNROLL_1024 == 0
 533 #define R1024(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, pA, pB, pC, pD, pE, pF, ROT, rn) \
 534                 Round1024(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, pA, pB, pC, pD, pE, pF, ROT, rn) \
 535                 Skein_Show_R_Ptr(BLK_BITS, &ctx->h, rn, Xptr);
 536
                     /* unrolled key injection: R is a constant, so schedule indices wrap with % */
 537 #define I1024(R)                                                        \
 538                 X00   += ks[((R) +  1) % 17]; /* inject the key schedule value */   \
 539                 X01   += ks[((R) +  2) % 17];                                       \
 540                 X02   += ks[((R) +  3) % 17];                                       \
 541                 X03   += ks[((R) +  4) % 17];                                       \
 542                 X04   += ks[((R) +  5) % 17];                                       \
 543                 X05   += ks[((R) +  6) % 17];                                       \
 544                 X06   += ks[((R) +  7) % 17];                                       \
 545                 X07   += ks[((R) +  8) % 17];                                       \
 546                 X08   += ks[((R) +  9) % 17];                                       \
 547                 X09   += ks[((R) + 10) % 17];                                       \
 548                 X10   += ks[((R) + 11) % 17];                                       \
 549                 X11   += ks[((R) + 12) % 17];                                       \
 550                 X12   += ks[((R) + 13) % 17];                                       \
 551                 X13   += ks[((R) + 14) % 17] + ts[((R) + 1) % 3];                   \
 552                 X14   += ks[((R) + 15) % 17] + ts[((R) + 2) % 3];                   \
 553                 X15   += ks[((R) + 16) % 17] +     (R) + 1;                         \
 554                 Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INJECT, Xptr);
 555 #else                                       /* looping version */
 556 #define R1024(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, pA, pB, pC, pD, pE, pF, ROT, rn) \
 557                 Round1024(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, pA, pB, pC, pD, pE, pF, ROT, rn) \
 558                 Skein_Show_R_Ptr(BLK_BITS, &ctx->h, 4 * (r - 1) + rn, Xptr);
 559
                     /*
                      * Looping key injection: instead of wrapping indices, each call
                      * appends a copy of the oldest schedule/tweak word at the end of
                      * the window.  Statement order matters: ts and ks share kw[]
                      * (ts[i + 3] is ks[i]), so ts[r + R + 2] is the same storage as
                      * ks[r + R - 1].  That ks word must be copied to ks[r + R + 16]
                      * before the ts store overwrites it.
                      */
 560 #define I1024(R)                                                      \
 561                 X00   += ks[r + (R) +  0];    /* inject the key schedule value */     \
 562                 X01   += ks[r + (R) +  1];                                            \
 563                 X02   += ks[r + (R) +  2];                                            \
 564                 X03   += ks[r + (R) +  3];                                            \
 565                 X04   += ks[r + (R) +  4];                                            \
 566                 X05   += ks[r + (R) +  5];                                            \
 567                 X06   += ks[r + (R) +  6];                                            \
 568                 X07   += ks[r + (R) +  7];                                            \
 569                 X08   += ks[r + (R) +  8];                                            \
 570                 X09   += ks[r + (R) +  9];                                            \
 571                 X10   += ks[r + (R) + 10];                                            \
 572                 X11   += ks[r + (R) + 11];                                            \
 573                 X12   += ks[r + (R) + 12];                                            \
 574                 X13   += ks[r + (R) + 13] + ts[r + (R) + 0];                          \
 575                 X14   += ks[r + (R) + 14] + ts[r + (R) + 1];                          \
 576                 X15   += ks[r + (R) + 15] +         r + (R);                          \
 577                 ks[r  +         (R) + 16] = ks[r + (R) - 1]; /* rotate key schedule */\
 578                 ts[r  +         (R) +  2] = ts[r + (R) - 1];                          \
 579                 Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INJECT, Xptr);
 580
                     /* r is always odd (starts at 1, even step), so it never equals 2 * RCNT */
 581                 for (r = 1; r < 2 * RCNT; r += 2 * SKEIN_UNROLL_1024)    /* loop thru it */
 582 #endif
                     /* body of the for loop when looping; a plain compound statement when unrolled */
 583                 {
 584 #define R1024_8_rounds(R)    /* do 8 full rounds */                               \
 585                         R1024(00, 01, 02, 03, 04, 05, 06, 07, 08, 09, 10, 11, 12, 13, 14, 15, R1024_0, 8*(R) + 1); \
 586                         R1024(00, 09, 02, 13, 06, 11, 04, 15, 10, 07, 12, 03, 14, 05, 08, 01, R1024_1, 8*(R) + 2); \
 587                         R1024(00, 07, 02, 05, 04, 03, 06, 01, 12, 15, 14, 13, 08, 11, 10, 09, R1024_2, 8*(R) + 3); \
 588                         R1024(00, 15, 02, 11, 06, 13, 04, 09, 14, 01, 08, 05, 10, 03, 12, 07, R1024_3, 8*(R) + 4); \
 589                         I1024(2*(R));                                                             \
 590                         R1024(00, 01, 02, 03, 04, 05, 06, 07, 08, 09, 10, 11, 12, 13, 14, 15, R1024_4, 8*(R) + 5); \
 591                         R1024(00, 09, 02, 13, 06, 11, 04, 15, 10, 07, 12, 03, 14, 05, 08, 01, R1024_5, 8*(R) + 6); \
 592                         R1024(00, 07, 02, 05, 04, 03, 06, 01, 12, 15, 14, 13, 08, 11, 10, 09, R1024_6, 8*(R) + 7); \
 593                         R1024(00, 15, 02, 11, 06, 13, 04, 09, 14, 01, 08, 05, 10, 03, 12, 07, R1024_7, 8*(R) + 8); \
 594                         I1024(2*(R)+1);
 595
 596                         R1024_8_rounds(0);
 597
                     /*
                      * R1024_Unroll_R(NN) is non-zero when 8-round group NN must be
                      * emitted: either fully unrolled with more than NN groups in
                      * total, or looping with an unroll count greater than NN.
                      */
 598 #define R1024_Unroll_R(NN) ((SKEIN_UNROLL_1024 == 0 && SKEIN1024_ROUNDS_TOTAL/8 > (NN)) || (SKEIN_UNROLL_1024 > (NN)))
 599
 600         #if   R1024_Unroll_R(1)
 601                         R1024_8_rounds(1);
 602         #endif
 603         #if   R1024_Unroll_R(2)
 604                         R1024_8_rounds(2);
 605         #endif
 606         #if   R1024_Unroll_R(3)
 607                         R1024_8_rounds(3);
 608         #endif
 609         #if   R1024_Unroll_R(4)
 610                         R1024_8_rounds(4);
 611         #endif
 612         #if   R1024_Unroll_R(5)
 613                         R1024_8_rounds(5);
 614         #endif
 615         #if   R1024_Unroll_R(6)
 616                         R1024_8_rounds(6);
 617         #endif
 618         #if   R1024_Unroll_R(7)
 619                         R1024_8_rounds(7);
 620         #endif
 621         #if   R1024_Unroll_R(8)
 622                         R1024_8_rounds(8);
 623         #endif
 624         #if   R1024_Unroll_R(9)
 625                         R1024_8_rounds(9);
 626         #endif
 627         #if   R1024_Unroll_R(10)
 628                         R1024_8_rounds(10);
 629         #endif
 630         #if   R1024_Unroll_R(11)
 631                         R1024_8_rounds(11);
 632         #endif
 633         #if   R1024_Unroll_R(12)
 634                         R1024_8_rounds(12);
 635         #endif
 636         #if   R1024_Unroll_R(13)
 637                         R1024_8_rounds(13);
 638         #endif
 639         #if   R1024_Unroll_R(14)
 640                         R1024_8_rounds(14);
 641         #endif
 642         #if  (SKEIN_UNROLL_1024 > 14)
 643 #error  "need more unrolling in Skein1024_Process_Block"
 644         #endif
 645                 }
 646                 /* do the final "feedforward" xor, update context chaining vars */
 647
 648                 ctx->X[0] = X00 ^ w[0];
 649                 ctx->X[1] = X01 ^ w[1];
 650                 ctx->X[2] = X02 ^ w[2];
 651                 ctx->X[3] = X03 ^ w[3];
 652                 ctx->X[4] = X04 ^ w[4];
 653                 ctx->X[5] = X05 ^ w[5];
 654                 ctx->X[6] = X06 ^ w[6];
 655                 ctx->X[7] = X07 ^ w[7];
 656                 ctx->X[8] = X08 ^ w[8];
 657                 ctx->X[9] = X09 ^ w[9];
 658                 ctx->X[10] = X10 ^ w[10];
 659                 ctx->X[11] = X11 ^ w[11];
 660                 ctx->X[12] = X12 ^ w[12];
 661                 ctx->X[13] = X13 ^ w[13];
 662                 ctx->X[14] = X14 ^ w[14];
 663                 ctx->X[15] = X15 ^ w[15];
 664
 665                 Skein_Show_Round(BLK_BITS, &ctx->h, SKEIN_RND_FEED_FWD, ctx->X);
 666
                     /* clear SKEIN_T1_FLAG_FIRST in the working tweak before any further block */
 667                 ts[1] &= ~SKEIN_T1_FLAG_FIRST;
 668                 blkPtr += SKEIN1024_BLOCK_BYTES;
 669         }
 670         while (--blkCnt);
             /* store the working tweak back into the context */
 671         ctx->h.T[0] = ts[0];
 672         ctx->h.T[1] = ts[1];
 673 }
 674
 675 #if defined(SKEIN_CODE_SIZE) || defined(SKEIN_PERF)
 676 size_t Skein1024_Process_Block_CodeSize(void)
 677 {
             /*
              * Byte distance from the entry of Skein1024_Process_Block to the
              * entry of this function.
              * NOTE(review): this equals the block function's code size only if
              * the toolchain places the two functions back to back in definition
              * order -- confirm for the build in use.
              */
 678         u8 *block_fn_entry = (u8 *) Skein1024_Process_Block;
 679         u8 *this_fn_entry = (u8 *) Skein1024_Process_Block_CodeSize;

             return this_fn_entry - block_fn_entry;
 680 }
 681 unsigned int Skein1024_Unroll_Cnt(void)
 682 {
             /* compile-time unroll setting (SKEIN_UNROLL_1024) used to build the 1024-bit block function */
 683         const unsigned int unroll_setting = SKEIN_UNROLL_1024;

             return unroll_setting;
 684 }
 685 #endif
 686 #endif