1 /***********************************************************************
3 ** Implementation of the Skein block functions.
5 ** Source code author: Doug Whiting, 2008.
7 ** This algorithm and source code is released to the public domain.
9 ** Compile-time switches:
11 ** SKEIN_USE_ASM -- set bits (256/512/1024) to select which
12 ** versions use ASM code for block processing
13 ** [default: use C for all block sizes]
15 ************************************************************************/
17 #include <linux/string.h>
/* Build-time configuration and local shorthands used by the block functions. */
21 #define SKEIN_USE_ASM (0) /* default is all C code (no ASM) */
/*
 * SKEIN_LOOP is decoded as three decimal digits by the SKEIN_UNROLL_*
 * definitions further down: hundreds -> 256, tens -> 512, units -> 1024
 * (they use /100, /10 and %10).  With 001 the 256 and 512 counts are 0,
 * which selects the "fully unrolled" macro variants, and the 1024 count is 1.
 * NOTE(review): a leading zero makes the literal octal, so a setting such as
 * 011 would be read as 9 rather than eleven -- confirm nobody relies on that.
 */
25 #define SKEIN_LOOP 001 /* default: unroll 256 and 512, but not 1024 */
28 #define BLK_BITS (WCNT*64) /* some useful definitions for code here */
29 #define KW_TWK_BASE (0)
30 #define KW_KEY_BASE (3)
/*
 * ks and ts are views into the function-local kw[] array: the tweak words
 * start at kw[0] and the key schedule words start at kw[3].
 */
31 #define ks (kw + KW_KEY_BASE)
32 #define ts (kw + KW_TWK_BASE)
/*
 * NOTE(review): DebugSaveTweak is defined twice in a row with no visible
 * conditional between the two lines; presumably the first (copy ts[0..1]
 * back into ctx->h.T) is for debug builds and the empty one for normal
 * builds -- confirm the guarding #if/#else/#endif.
 */
35 #define DebugSaveTweak(ctx) { ctx->h.T[0] = ts[0]; ctx->h.T[1] = ts[1]; }
37 #define DebugSaveTweak(ctx)
40 /***************************** Skein_256 ******************************/
41 #if !(SKEIN_USE_ASM & 256)
/*
 * Skein_256_Process_Block - Skein-256 block function (C version).
 *
 * ctx:        hash context; ctx->X is passed to the debug hooks here and is
 *             overwritten by the feed-forward XOR at the end of the function,
 *             ctx->h is handed to the Skein_Show_* hooks
 * blkPtr:     input bytes, read as WCNT little-endian 64-bit words and then
 *             advanced by SKEIN_256_BLOCK_BYTES
 * blkCnt:     block count; must not be zero (asserted)
 * byteCntAdd: value added to ts[0], the processed-length word of the tweak
 *
 * NOTE(review): several structural lines (opening brace, #else/#endif, any
 * loop over blkCnt) are not visible in this part of the file -- confirm
 * against the complete source before relying on this description.
 */
42 void Skein_256_Process_Block(struct skein_256_ctx
*ctx
, const u8
*blkPtr
, size_t blkCnt
, size_t byteCntAdd
)
45 WCNT
= SKEIN_256_STATE_WORDS
/* RCNT: number of 8-round groups in the full round count */
48 #define RCNT (SKEIN_256_ROUNDS_TOTAL/8)
50 #ifdef SKEIN_LOOP /* configure how much to unroll the loop */
51 #define SKEIN_UNROLL_256 (((SKEIN_LOOP)/100)%10)
53 #define SKEIN_UNROLL_256 (0)
57 #if (RCNT % SKEIN_UNROLL_256)
58 #error "Invalid SKEIN_UNROLL_256" /* sanity check on unroll count */
/*
 * NOTE(review): kw is declared twice below; presumably the larger array is
 * for the looping build (extra 2*RCNT slots written by the "rotate key
 * schedule" statements) and the smaller one for the unrolled build --
 * confirm the conditional that selects between them.
 */
61 u64 kw
[WCNT
+4+RCNT
*2]; /* key schedule words : chaining vars + tweak + "rotation"*/
63 u64 kw
[WCNT
+4]; /* key schedule words : chaining vars + tweak */
65 u64 X0
, X1
, X2
, X3
; /* local copy of context vars, for speed */
66 u64 w
[WCNT
]; /* local copy of input block */
/* Xptr[] exposes the addresses of X0..X3 to the Skein_Show_R_Ptr hook */
68 const u64
*Xptr
[4]; /* use for debugging (help compiler put Xn in registers) */
69 Xptr
[0] = &X0
; Xptr
[1] = &X1
; Xptr
[2] = &X2
; Xptr
[3] = &X3
;
71 Skein_assert(blkCnt
!= 0); /* never call with blkCnt == 0! */
75 /* this implementation only supports 2**64 input bytes (no carry out here) */
76 ts
[0] += byteCntAdd
; /* update processed length */
78 /* precompute the key schedule for this block */
/*
 * ks[4] is the XOR of ks[0..3] with SKEIN_KS_PARITY.
 * NOTE(review): the statements that load ks[0..3] are not visible here --
 * confirm they precede this line.
 */
83 ks
[4] = ks
[0] ^ ks
[1] ^ ks
[2] ^ ks
[3] ^ SKEIN_KS_PARITY
;
/* third tweak word is the XOR of the first two */
85 ts
[2] = ts
[0] ^ ts
[1];
87 Skein_Get64_LSB_First(w
, blkPtr
, WCNT
); /* get input block in little-endian format */
89 Skein_Show_Block(BLK_BITS
, &ctx
->h
, ctx
->X
, blkPtr
, w
, ks
, ts
);
/*
 * Initial key injection: state word = input word + key word, with ts[0] and
 * ts[1] added to X1 and X2.
 * NOTE(review): no assignment to X3 is visible although X3 is consumed by
 * the rounds and by the feed-forward -- confirm it is initialised.
 */
91 X0
= w
[0] + ks
[0]; /* do the first full key injection */
92 X1
= w
[1] + ks
[1] + ts
[0];
93 X2
= w
[2] + ks
[2] + ts
[1];
96 Skein_Show_R_Ptr(BLK_BITS
, &ctx
->h
, SKEIN_RND_KEY_INITIAL
, Xptr
); /* show starting state values */
98 blkPtr
+= SKEIN_256_BLOCK_BYTES
;
/*
 * Round256 - one mixing round over the local state words X0..X3.
 *
 * p0..p3 are the digit suffixes of the X<n> locals, taken as two pairs
 * (p0, p1) and (p2, p3).  For each pair the second word is added into the
 * first, the second word is rotated left by ROT##_0 (first pair) or ROT##_1
 * (second pair) and then XORed with the updated first word.  rNum is not
 * used in the expansion.
 *
 * The macro expands to bare statements (no enclosing block) because R256()
 * appends its Skein_Show_R_Ptr() call directly after it.  The last line
 * carries no line-continuation backslash: ending the definition with one
 * makes the macro absorb whatever source line happens to follow it.
 */
#define Round256(p0, p1, p2, p3, ROT, rNum)                              \
	X##p0 += X##p1; X##p1 = RotL_64(X##p1, ROT##_0); X##p1 ^= X##p0; \
	X##p2 += X##p3; X##p3 = RotL_64(X##p3, ROT##_1); X##p3 ^= X##p2;
/*
 * Two build variants of the per-round macro R256 and of the key-injection
 * statements, selected by SKEIN_UNROLL_256.
 *
 * Unrolled (SKEIN_UNROLL_256 == 0): R256 runs Round256 and reports round
 * number rNum to the debug hook; the injection adds ks[] words indexed
 * modulo 5 and ts[] words indexed modulo 3, plus (R)+1 into X3.
 *
 * Looping: R256 reports 4 * (r - 1) + rNum; the injection indexes ks[]/ts[]
 * linearly by r + (R) and then appends one entry to each schedule
 * (ks[r+R+4] = ks[r+R-1], ts[r+R+2] = ts[r+R-1]) instead of using modulo.
 *
 * NOTE(review): the header line of the key-injection macro (compare I512,
 * which R512_8_rounds invokes) is not visible in front of either group of
 * injection statements below -- confirm against the complete source.
 */
106 #if SKEIN_UNROLL_256 == 0
107 #define R256(p0, p1, p2, p3, ROT, rNum) /* fully unrolled */ \
108 Round256(p0, p1, p2, p3, ROT, rNum) \
109 Skein_Show_R_Ptr(BLK_BITS, &ctx->h, rNum, Xptr);
112 X0 += ks[((R)+1) % 5]; /* inject the key schedule value */ \
113 X1 += ks[((R)+2) % 5] + ts[((R)+1) % 3]; \
114 X2 += ks[((R)+3) % 5] + ts[((R)+2) % 3]; \
115 X3 += ks[((R)+4) % 5] + (R)+1; \
116 Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INJECT, Xptr);
117 #else /* looping version */
118 #define R256(p0, p1, p2, p3, ROT, rNum) \
119 Round256(p0, p1, p2, p3, ROT, rNum) \
120 Skein_Show_R_Ptr(BLK_BITS, &ctx->h, 4 * (r - 1) + rNum, Xptr);
123 X0 += ks[r+(R)+0]; /* inject the key schedule value */ \
124 X1 += ks[r+(R)+1] + ts[r+(R)+0]; \
125 X2 += ks[r+(R)+2] + ts[r+(R)+1]; \
126 X3 += ks[r+(R)+3] + r+(R); \
127 ks[r + (R) + 4] = ks[r + (R) - 1]; /* rotate key schedule */\
128 ts[r + (R) + 2] = ts[r + (R) - 1]; \
129 Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INJECT, Xptr);
/* r starts at 1 and advances by 2 * SKEIN_UNROLL_256 per pass */
131 for (r
= 1; r
< 2 * RCNT
; r
+= 2 * SKEIN_UNROLL_256
) /* loop thru it */
/*
 * R256_8_rounds - emit rounds 8*(R)+1 .. 8*(R)+8 of the Skein-256 block
 * function together with the key injections that belong to them.
 *
 * The word order alternates between (0,1,2,3) and (0,3,2,1) and the
 * rotation sets R_256_0 .. R_256_7 are used once each.  A key injection
 * follows every fourth round: I256(2 * (R)) after round 4 of the group and
 * I256(2 * (R) + 1) after round 8, mirroring the I512(2 * (R) + 1) call
 * that closes R512_8_rounds.
 *
 * NOTE(review): the two I256() calls were absent from this definition and
 * the last line ended in a stray line-continuation backslash; the injection
 * macro name follows the I512 naming pattern -- confirm it matches the
 * key-injection macro defined for the 256-bit variant.
 */
#define R256_8_rounds(R)                          \
	R256(0, 1, 2, 3, R_256_0, 8 * (R) + 1);   \
	R256(0, 3, 2, 1, R_256_1, 8 * (R) + 2);   \
	R256(0, 1, 2, 3, R_256_2, 8 * (R) + 3);   \
	R256(0, 3, 2, 1, R_256_3, 8 * (R) + 4);   \
	I256(2 * (R));                            \
	R256(0, 1, 2, 3, R_256_4, 8 * (R) + 5);   \
	R256(0, 3, 2, 1, R_256_5, 8 * (R) + 6);   \
	R256(0, 1, 2, 3, R_256_6, 8 * (R) + 7);   \
	R256(0, 3, 2, 1, R_256_7, 8 * (R) + 8);   \
	I256(2 * (R) + 1);
/*
 * R256_Unroll_R(NN) is non-zero when 8-round group number NN has to be
 * emitted: either the build is fully unrolled (SKEIN_UNROLL_256 == 0) and
 * the algorithm has more than NN groups, or the configured unroll count is
 * greater than NN.
 *
 * NOTE(review): only the guards for groups 10..14 are visible below, and
 * neither the guarded statements nor the matching #endif lines are --
 * confirm against the complete source.
 */
148 #define R256_Unroll_R(NN) ((SKEIN_UNROLL_256 == 0 && SKEIN_256_ROUNDS_TOTAL/8 > (NN)) || (SKEIN_UNROLL_256 > (NN)))
177 #if R256_Unroll_R(10)
180 #if R256_Unroll_R(11)
183 #if R256_Unroll_R(12)
186 #if R256_Unroll_R(13)
189 #if R256_Unroll_R(14)
/* unroll counts above 14 have no matching guard above, so reject them */
192 #if (SKEIN_UNROLL_256 > 14)
193 #error "need more unrolling in Skein_256_Process_Block"
/*
 * Feed-forward: each chaining word in ctx->X becomes the final state word
 * XORed with the corresponding input word w[i].
 */
196 /* do the final "feedforward" xor, update context chaining vars */
197 ctx
->X
[0] = X0
^ w
[0];
198 ctx
->X
[1] = X1
^ w
[1];
199 ctx
->X
[2] = X2
^ w
[2];
200 ctx
->X
[3] = X3
^ w
[3];
202 Skein_Show_Round(BLK_BITS
, &ctx
->h
, SKEIN_RND_FEED_FWD
, ctx
->X
);
/* clear SKEIN_T1_FLAG_FIRST in the second tweak word */
204 ts
[1] &= ~SKEIN_T1_FLAG_FIRST
;
211 #if defined(SKEIN_CODE_SIZE) || defined(SKEIN_PERF)
212 size_t Skein_256_Process_Block_CodeSize(void)
214 return ((u8
*) Skein_256_Process_Block_CodeSize
) -
215 ((u8
*) Skein_256_Process_Block
);
217 unsigned int Skein_256_Unroll_Cnt(void)
219 return SKEIN_UNROLL_256
;
224 /***************************** Skein_512 ******************************/
225 #if !(SKEIN_USE_ASM & 512)
/*
 * Skein_512_Process_Block - Skein-512 block function (C version).
 *
 * ctx:        hash context; ctx->X is passed to the debug hooks here and is
 *             overwritten by the feed-forward XOR at the end of the function,
 *             ctx->h is handed to the Skein_Show_* hooks
 * blkPtr:     input bytes, read as WCNT little-endian 64-bit words and then
 *             advanced by SKEIN_512_BLOCK_BYTES
 * blkCnt:     block count; must not be zero (asserted)
 * byteCntAdd: value added to ts[0], the processed-length word of the tweak
 *
 * NOTE(review): several structural lines (opening brace, #else/#endif, any
 * loop over blkCnt) are not visible in this part of the file -- confirm
 * against the complete source before relying on this description.
 */
226 void Skein_512_Process_Block(struct skein_512_ctx
*ctx
, const u8
*blkPtr
, size_t blkCnt
, size_t byteCntAdd
)
229 WCNT
= SKEIN_512_STATE_WORDS
/* RCNT: number of 8-round groups in the full round count */
232 #define RCNT (SKEIN_512_ROUNDS_TOTAL/8)
234 #ifdef SKEIN_LOOP /* configure how much to unroll the loop */
235 #define SKEIN_UNROLL_512 (((SKEIN_LOOP)/10)%10)
237 #define SKEIN_UNROLL_512 (0)
241 #if (RCNT % SKEIN_UNROLL_512)
242 #error "Invalid SKEIN_UNROLL_512" /* sanity check on unroll count */
/*
 * NOTE(review): kw is declared twice below; presumably the larger array is
 * for the looping build and the smaller one for the unrolled build --
 * confirm the conditional that selects between them.
 */
245 u64 kw
[WCNT
+4+RCNT
*2]; /* key schedule words : chaining vars + tweak + "rotation"*/
247 u64 kw
[WCNT
+4]; /* key schedule words : chaining vars + tweak */
249 u64 X0
, X1
, X2
, X3
, X4
, X5
, X6
, X7
; /* local copy of vars, for speed */
250 u64 w
[WCNT
]; /* local copy of input block */
/* Xptr[] exposes the addresses of X0..X7 to the Skein_Show_R_Ptr hook */
252 const u64
*Xptr
[8]; /* use for debugging (help compiler put Xn in registers) */
253 Xptr
[0] = &X0
; Xptr
[1] = &X1
; Xptr
[2] = &X2
; Xptr
[3] = &X3
;
254 Xptr
[4] = &X4
; Xptr
[5] = &X5
; Xptr
[6] = &X6
; Xptr
[7] = &X7
;
257 Skein_assert(blkCnt
!= 0); /* never call with blkCnt == 0! */
261 /* this implementation only supports 2**64 input bytes (no carry out here) */
262 ts
[0] += byteCntAdd
; /* update processed length */
264 /* precompute the key schedule for this block */
/*
 * ks[8] is the XOR of ks[0..7] with SKEIN_KS_PARITY.
 * NOTE(review): the statements that load ks[0..7] are not visible here --
 * confirm they precede this line.
 */
273 ks
[8] = ks
[0] ^ ks
[1] ^ ks
[2] ^ ks
[3] ^
274 ks
[4] ^ ks
[5] ^ ks
[6] ^ ks
[7] ^ SKEIN_KS_PARITY
;
/* third tweak word is the XOR of the first two */
276 ts
[2] = ts
[0] ^ ts
[1];
278 Skein_Get64_LSB_First(w
, blkPtr
, WCNT
); /* get input block in little-endian format */
280 Skein_Show_Block(BLK_BITS
, &ctx
->h
, ctx
->X
, blkPtr
, w
, ks
, ts
);
/*
 * Initial key injection: state word = input word + key word, with ts[0] and
 * ts[1] added to X5 and X6.
 * NOTE(review): only the assignments to X0, X5 and X6 are visible; X1..X4
 * and X7 are consumed later -- confirm they are initialised.
 */
282 X0
= w
[0] + ks
[0]; /* do the first full key injection */
287 X5
= w
[5] + ks
[5] + ts
[0];
288 X6
= w
[6] + ks
[6] + ts
[1];
291 blkPtr
+= SKEIN_512_BLOCK_BYTES
;
293 Skein_Show_R_Ptr(BLK_BITS
, &ctx
->h
, SKEIN_RND_KEY_INITIAL
, Xptr
);
/*
 * Round512 - one mixing round over the local state words X0..X7.
 *
 * p0..p7 are the digit suffixes of the X<n> locals, taken as four pairs
 * (p0, p1), (p2, p3), (p4, p5), (p6, p7).  For each pair the second word is
 * added into the first, the second word is rotated left by ROT##_0 ..
 * ROT##_3 respectively and then XORed with the updated first word.  rNum is
 * not used in the expansion.
 *
 * The macro expands to bare statements (no enclosing block) because R512()
 * appends its Skein_Show_R_Ptr() call directly after it.  The last line
 * carries no line-continuation backslash: ending the definition with one
 * makes the macro absorb whatever source line happens to follow it.
 */
#define Round512(p0, p1, p2, p3, p4, p5, p6, p7, ROT, rNum)              \
	X##p0 += X##p1; X##p1 = RotL_64(X##p1, ROT##_0); X##p1 ^= X##p0; \
	X##p2 += X##p3; X##p3 = RotL_64(X##p3, ROT##_1); X##p3 ^= X##p2; \
	X##p4 += X##p5; X##p5 = RotL_64(X##p5, ROT##_2); X##p5 ^= X##p4; \
	X##p6 += X##p7; X##p7 = RotL_64(X##p7, ROT##_3); X##p7 ^= X##p6;
/*
 * Two build variants of the per-round macro R512 and of the key-injection
 * statements, selected by SKEIN_UNROLL_512.
 *
 * Unrolled (SKEIN_UNROLL_512 == 0): R512 runs Round512 and reports round
 * number rNum to the debug hook; the injection adds ks[] words indexed
 * modulo 9, ts[] words indexed modulo 3 into X5/X6, and (R) + 1 into X7.
 *
 * Looping: R512 reports 4 * (r - 1) + rNum; the injection indexes ks[]/ts[]
 * linearly by r + (R) and then appends one entry to each schedule
 * (ks[r+R+8] = ks[r+R-1], ts[r+R+2] = ts[r+R-1]) instead of using modulo.
 *
 * NOTE(review): the "#define I512(R)" header that R512_8_rounds relies on
 * is not visible in front of either group of injection statements below --
 * confirm against the complete source.
 */
301 #if SKEIN_UNROLL_512 == 0
302 #define R512(p0, p1, p2, p3, p4, p5, p6, p7, ROT, rNum) /* unrolled */ \
303 Round512(p0, p1, p2, p3, p4, p5, p6, p7, ROT, rNum) \
304 Skein_Show_R_Ptr(BLK_BITS, &ctx->h, rNum, Xptr);
307 X0 += ks[((R) + 1) % 9]; /* inject the key schedule value */ \
308 X1 += ks[((R) + 2) % 9]; \
309 X2 += ks[((R) + 3) % 9]; \
310 X3 += ks[((R) + 4) % 9]; \
311 X4 += ks[((R) + 5) % 9]; \
312 X5 += ks[((R) + 6) % 9] + ts[((R) + 1) % 3]; \
313 X6 += ks[((R) + 7) % 9] + ts[((R) + 2) % 3]; \
314 X7 += ks[((R) + 8) % 9] + (R) + 1; \
315 Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INJECT, Xptr);
316 #else /* looping version */
317 #define R512(p0, p1, p2, p3, p4, p5, p6, p7, ROT, rNum) \
318 Round512(p0, p1, p2, p3, p4, p5, p6, p7, ROT, rNum) \
319 Skein_Show_R_Ptr(BLK_BITS, &ctx->h, 4 * (r - 1) + rNum, Xptr);
322 X0 += ks[r + (R) + 0]; /* inject the key schedule value */ \
323 X1 += ks[r + (R) + 1]; \
324 X2 += ks[r + (R) + 2]; \
325 X3 += ks[r + (R) + 3]; \
326 X4 += ks[r + (R) + 4]; \
327 X5 += ks[r + (R) + 5] + ts[r + (R) + 0]; \
328 X6 += ks[r + (R) + 6] + ts[r + (R) + 1]; \
329 X7 += ks[r + (R) + 7] + r + (R); \
330 ks[r + (R) + 8] = ks[r + (R) - 1]; /* rotate key schedule */ \
331 ts[r + (R) + 2] = ts[r + (R) - 1]; \
332 Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INJECT, Xptr);
/* r starts at 1 and advances by 2 * SKEIN_UNROLL_512 per pass */
334 for (r
= 1; r
< 2 * RCNT
; r
+= 2 * SKEIN_UNROLL_512
) /* loop thru it */
335 #endif /* end of looped code definitions */
/*
 * R512_8_rounds - emit rounds 8*(R)+1 .. 8*(R)+8 of the Skein-512 block
 * function together with the key injections that belong to them.
 *
 * The four word orderings repeat after round 4 and the rotation sets
 * R_512_0 .. R_512_7 are used once each.  A key injection follows every
 * fourth round: I512(2 * (R)) after round 4 of the group and
 * I512(2 * (R) + 1) after round 8.
 *
 * NOTE(review): the I512(2 * (R)) call after the fourth round was absent
 * from this definition; without it only every second key injection would be
 * performed.  Confirm against the complete source.
 */
#define R512_8_rounds(R)  /* do 8 full rounds */                  \
	R512(0, 1, 2, 3, 4, 5, 6, 7, R_512_0, 8 * (R) + 1);       \
	R512(2, 1, 4, 7, 6, 5, 0, 3, R_512_1, 8 * (R) + 2);       \
	R512(4, 1, 6, 3, 0, 5, 2, 7, R_512_2, 8 * (R) + 3);       \
	R512(6, 1, 0, 7, 2, 5, 4, 3, R_512_3, 8 * (R) + 4);       \
	I512(2 * (R));                                            \
	R512(0, 1, 2, 3, 4, 5, 6, 7, R_512_4, 8 * (R) + 5);       \
	R512(2, 1, 4, 7, 6, 5, 0, 3, R_512_5, 8 * (R) + 6);       \
	R512(4, 1, 6, 3, 0, 5, 2, 7, R_512_6, 8 * (R) + 7);       \
	R512(6, 1, 0, 7, 2, 5, 4, 3, R_512_7, 8 * (R) + 8);       \
	I512(2 * (R) + 1);        /* and key injection */
/*
 * R512_Unroll_R(NN) is non-zero when 8-round group number NN has to be
 * emitted: either the build is fully unrolled (SKEIN_UNROLL_512 == 0) and
 * the algorithm has more than NN groups, or the configured unroll count is
 * greater than NN.
 *
 * NOTE(review): only the guards for groups 10..14 are visible below, and
 * neither the guarded statements nor the matching #endif lines are --
 * confirm against the complete source.
 */
351 #define R512_Unroll_R(NN) ((SKEIN_UNROLL_512 == 0 && SKEIN_512_ROUNDS_TOTAL/8 > (NN)) || (SKEIN_UNROLL_512 > (NN)))
380 #if R512_Unroll_R(10)
383 #if R512_Unroll_R(11)
386 #if R512_Unroll_R(12)
389 #if R512_Unroll_R(13)
392 #if R512_Unroll_R(14)
/* unroll counts above 14 have no matching guard above, so reject them */
395 #if (SKEIN_UNROLL_512 > 14)
396 #error "need more unrolling in Skein_512_Process_Block"
/*
 * Feed-forward: each chaining word in ctx->X becomes the final state word
 * XORed with the corresponding input word w[i].
 */
400 /* do the final "feedforward" xor, update context chaining vars */
401 ctx
->X
[0] = X0
^ w
[0];
402 ctx
->X
[1] = X1
^ w
[1];
403 ctx
->X
[2] = X2
^ w
[2];
404 ctx
->X
[3] = X3
^ w
[3];
405 ctx
->X
[4] = X4
^ w
[4];
406 ctx
->X
[5] = X5
^ w
[5];
407 ctx
->X
[6] = X6
^ w
[6];
408 ctx
->X
[7] = X7
^ w
[7];
409 Skein_Show_Round(BLK_BITS
, &ctx
->h
, SKEIN_RND_FEED_FWD
, ctx
->X
);
/* clear SKEIN_T1_FLAG_FIRST in the second tweak word */
411 ts
[1] &= ~SKEIN_T1_FLAG_FIRST
;
418 #if defined(SKEIN_CODE_SIZE) || defined(SKEIN_PERF)
419 size_t Skein_512_Process_Block_CodeSize(void)
421 return ((u8
*) Skein_512_Process_Block_CodeSize
) -
422 ((u8
*) Skein_512_Process_Block
);
424 unsigned int Skein_512_Unroll_Cnt(void)
426 return SKEIN_UNROLL_512
;
431 /***************************** Skein1024 ******************************/
432 #if !(SKEIN_USE_ASM & 1024)
/*
 * Skein1024_Process_Block - Skein-1024 block function (C version).
 *
 * ctx:        hash context; ctx->X is passed to the debug hooks here and is
 *             overwritten by the feed-forward XOR at the end of the function,
 *             ctx->h is handed to the Skein_Show_* hooks
 * blkPtr:     input bytes, read as WCNT little-endian 64-bit words
 * blkCnt:     block count; must not be zero (asserted)
 * byteCntAdd: value added to ts[0], the processed-length word of the tweak
 *
 * NOTE(review): several structural lines (#else/#endif, any loop over
 * blkCnt) are not visible in this part of the file -- confirm against the
 * complete source before relying on this description.
 */
433 void Skein1024_Process_Block(struct skein1024_ctx
*ctx
, const u8
*blkPtr
, size_t blkCnt
, size_t byteCntAdd
)
434 { /* do it in C, always looping (unrolled is bigger AND slower!) */
436 WCNT
= SKEIN1024_STATE_WORDS
/* RCNT: number of 8-round groups in the full round count */
439 #define RCNT (SKEIN1024_ROUNDS_TOTAL/8)
441 #ifdef SKEIN_LOOP /* configure how much to unroll the loop */
442 #define SKEIN_UNROLL_1024 ((SKEIN_LOOP)%10)
444 #define SKEIN_UNROLL_1024 (0)
/* the divisibility check is only evaluated for a non-zero unroll count */
447 #if (SKEIN_UNROLL_1024 != 0)
448 #if (RCNT % SKEIN_UNROLL_1024)
449 #error "Invalid SKEIN_UNROLL_1024" /* sanity check on unroll count */
/*
 * NOTE(review): kw is declared twice below; presumably the larger array is
 * for the looping build and the smaller one for the unrolled build --
 * confirm the conditional that selects between them.
 */
452 u64 kw
[WCNT
+4+RCNT
*2]; /* key schedule words : chaining vars + tweak + "rotation"*/
454 u64 kw
[WCNT
+4]; /* key schedule words : chaining vars + tweak */
457 u64 X00
, X01
, X02
, X03
, X04
, X05
, X06
, X07
, /* local copy of vars, for speed */
458 X08
, X09
, X10
, X11
, X12
, X13
, X14
, X15
;
459 u64 w
[WCNT
]; /* local copy of input block */
/* Xptr[] exposes the addresses of X00..X15 to the Skein_Show_R_Ptr hook */
461 const u64
*Xptr
[16]; /* use for debugging (help compiler put Xn in registers) */
462 Xptr
[0] = &X00
; Xptr
[1] = &X01
; Xptr
[2] = &X02
; Xptr
[3] = &X03
;
463 Xptr
[4] = &X04
; Xptr
[5] = &X05
; Xptr
[6] = &X06
; Xptr
[7] = &X07
;
464 Xptr
[8] = &X08
; Xptr
[9] = &X09
; Xptr
[10] = &X10
; Xptr
[11] = &X11
;
465 Xptr
[12] = &X12
; Xptr
[13] = &X13
; Xptr
[14] = &X14
; Xptr
[15] = &X15
;
468 Skein_assert(blkCnt
!= 0); /* never call with blkCnt == 0! */
472 /* this implementation only supports 2**64 input bytes (no carry out here) */
473 ts
[0] += byteCntAdd
; /* update processed length */
475 /* precompute the key schedule for this block */
/*
 * ks[16] is the XOR of ks[0..15] with SKEIN_KS_PARITY.
 * NOTE(review): the statements that load ks[0..15] are not visible here --
 * confirm they precede this line.
 */
492 ks
[16] = ks
[0] ^ ks
[1] ^ ks
[2] ^ ks
[3] ^
493 ks
[4] ^ ks
[5] ^ ks
[6] ^ ks
[7] ^
494 ks
[8] ^ ks
[9] ^ ks
[10] ^ ks
[11] ^
495 ks
[12] ^ ks
[13] ^ ks
[14] ^ ks
[15] ^ SKEIN_KS_PARITY
;
/* third tweak word is the XOR of the first two */
497 ts
[2] = ts
[0] ^ ts
[1];
499 Skein_Get64_LSB_First(w
, blkPtr
, WCNT
); /* get input block in little-endian format */
501 Skein_Show_Block(BLK_BITS
, &ctx
->h
, ctx
->X
, blkPtr
, w
, ks
, ts
);
/*
 * Initial key injection: state word = input word + key word, with ts[0] and
 * ts[1] added to X13 and X14.
 * NOTE(review): only the assignments to X00 and X10..X15 are visible;
 * X01..X09 are consumed later -- confirm they are initialised.
 */
503 X00
= w
[0] + ks
[0]; /* do the first full key injection */
513 X10
= w
[10] + ks
[10];
514 X11
= w
[11] + ks
[11];
515 X12
= w
[12] + ks
[12];
516 X13
= w
[13] + ks
[13] + ts
[0];
517 X14
= w
[14] + ks
[14] + ts
[1];
518 X15
= w
[15] + ks
[15];
520 Skein_Show_R_Ptr(BLK_BITS
, &ctx
->h
, SKEIN_RND_KEY_INITIAL
, Xptr
);
/*
 * Round1024 - one mixing round over the local state words X00..X15.
 *
 * p0..pF are the two-digit suffixes of the X<nn> locals, taken as eight
 * pairs (p0, p1) .. (pE, pF).  For each pair the second word is added into
 * the first, the second word is rotated left by ROT##_0 .. ROT##_7
 * respectively and then XORed with the updated first word.  rNum is not
 * used in the expansion.
 *
 * The macro expands to bare statements (no enclosing block) because R1024()
 * appends its Skein_Show_R_Ptr() call directly after it.  The last line
 * carries no line-continuation backslash: ending the definition with one
 * makes the macro absorb whatever source line happens to follow it.
 */
#define Round1024(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, pA, pB, pC, pD, pE, pF, ROT, rNum) \
	X##p0 += X##p1; X##p1 = RotL_64(X##p1, ROT##_0); X##p1 ^= X##p0; \
	X##p2 += X##p3; X##p3 = RotL_64(X##p3, ROT##_1); X##p3 ^= X##p2; \
	X##p4 += X##p5; X##p5 = RotL_64(X##p5, ROT##_2); X##p5 ^= X##p4; \
	X##p6 += X##p7; X##p7 = RotL_64(X##p7, ROT##_3); X##p7 ^= X##p6; \
	X##p8 += X##p9; X##p9 = RotL_64(X##p9, ROT##_4); X##p9 ^= X##p8; \
	X##pA += X##pB; X##pB = RotL_64(X##pB, ROT##_5); X##pB ^= X##pA; \
	X##pC += X##pD; X##pD = RotL_64(X##pD, ROT##_6); X##pD ^= X##pC; \
	X##pE += X##pF; X##pF = RotL_64(X##pF, ROT##_7); X##pF ^= X##pE;
/*
 * Fully unrolled build variant (SKEIN_UNROLL_1024 == 0) of the per-round
 * macro R1024 and of the key-injection statements.
 *
 * R1024 runs Round1024 and reports round number rn to the debug hook.  The
 * injection adds ks[] words indexed modulo 17 to X00..X15, ts[] words
 * indexed modulo 3 to X13/X14, and (R) + 1 to X15.
 *
 * NOTE(review): the header line of the key-injection macro (compare I512,
 * which R512_8_rounds invokes) is not visible in front of the injection
 * statements below -- confirm against the complete source.
 */
532 #if SKEIN_UNROLL_1024 == 0
533 #define R1024(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, pA, pB, pC, pD, pE, pF, ROT, rn) \
534 Round1024(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, pA, pB, pC, pD, pE, pF, ROT, rn) \
535 Skein_Show_R_Ptr(BLK_BITS, &ctx->h, rn, Xptr);
538 X00 += ks[((R) + 1) % 17]; /* inject the key schedule value */ \
539 X01 += ks[((R) + 2) % 17]; \
540 X02 += ks[((R) + 3) % 17]; \
541 X03 += ks[((R) + 4) % 17]; \
542 X04 += ks[((R) + 5) % 17]; \
543 X05 += ks[((R) + 6) % 17]; \
544 X06 += ks[((R) + 7) % 17]; \
545 X07 += ks[((R) + 8) % 17]; \
546 X08 += ks[((R) + 9) % 17]; \
547 X09 += ks[((R) + 10) % 17]; \
548 X10 += ks[((R) + 11) % 17]; \
549 X11 += ks[((R) + 12) % 17]; \
550 X12 += ks[((R) + 13) % 17]; \
551 X13 += ks[((R) + 14) % 17] + ts[((R) + 1) % 3]; \
552 X14 += ks[((R) + 15) % 17] + ts[((R) + 2) % 3]; \
553 X15 += ks[((R) + 16) % 17] + (R) + 1; \
554 Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INJECT, Xptr);
555 #else /* looping version */
556 #define R1024(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, pA, pB, pC, pD, pE, pF, ROT, rn) \
557 Round1024(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, pA, pB, pC, pD, pE, pF, ROT, rn) \
558 Skein_Show_R_Ptr(BLK_BITS, &ctx->h, 4 * (r - 1) + rn, Xptr);
561 X00 += ks[r + (R) + 0]; /* inject the key schedule value */ \
562 X01 += ks[r + (R) + 1]; \
563 X02 += ks[r + (R) + 2]; \
564 X03 += ks[r + (R) + 3]; \
565 X04 += ks[r + (R) + 4]; \
566 X05 += ks[r + (R) + 5]; \
567 X06 += ks[r + (R) + 6]; \
568 X07 += ks[r + (R) + 7]; \
569 X08 += ks[r + (R) + 8]; \
570 X09 += ks[r + (R) + 9]; \
571 X10 += ks[r + (R) + 10]; \
572 X11 += ks[r + (R) + 11]; \
573 X12 += ks[r + (R) + 12]; \
574 X13 += ks[r + (R) + 13] + ts[r + (R) + 0]; \
575 X14 += ks[r + (R) + 14] + ts[r + (R) + 1]; \
576 X15 += ks[r + (R) + 15] + r + (R); \
577 ks[r + (R) + 16] = ks[r + (R) - 1]; /* rotate key schedule */\
578 ts[r + (R) + 2] = ts[r + (R) - 1]; \
579 Skein_Show_R_Ptr(BLK_BITSi, &ctx->h, SKEIN_RND_KEY_INJECT, Xptr);
/*
 * Round loop header for the looping build.  r starts at 1 and advances by
 * the even step 2 * SKEIN_UNROLL_1024, so it is always odd; since the bound
 * 2 * RCNT is even, the "<=" used here selects exactly the same iterations
 * as the "<" used by the 256- and 512-bit loops.
 */
581 for (r
= 1; r
<= 2 * RCNT
; r
+= 2 * SKEIN_UNROLL_1024
) /* loop thru it */
/*
 * R1024_8_rounds - emit rounds 8*(R)+1 .. 8*(R)+8 of the Skein-1024 block
 * function together with the key injections that belong to them.
 *
 * The four word orderings repeat after round 4 and the rotation sets
 * R1024_0 .. R1024_7 are used once each.  A key injection follows every
 * fourth round: I1024(2*(R)) after round 4 of the group and I1024(2*(R) + 1)
 * after round 8, mirroring the I512(2 * (R) + 1) call that closes
 * R512_8_rounds.
 *
 * NOTE(review): the two I1024() calls were absent from this definition and
 * the last line ended in a stray line-continuation backslash; the injection
 * macro name follows the I512 naming pattern -- confirm it matches the
 * key-injection macro defined for the 1024-bit variant.
 */
#define R1024_8_rounds(R)    /* do 8 full rounds */                                               \
	R1024(00, 01, 02, 03, 04, 05, 06, 07, 08, 09, 10, 11, 12, 13, 14, 15, R1024_0, 8*(R) + 1); \
	R1024(00, 09, 02, 13, 06, 11, 04, 15, 10, 07, 12, 03, 14, 05, 08, 01, R1024_1, 8*(R) + 2); \
	R1024(00, 07, 02, 05, 04, 03, 06, 01, 12, 15, 14, 13, 08, 11, 10, 09, R1024_2, 8*(R) + 3); \
	R1024(00, 15, 02, 11, 06, 13, 04, 09, 14, 01, 08, 05, 10, 03, 12, 07, R1024_3, 8*(R) + 4); \
	I1024(2*(R));                                                                              \
	R1024(00, 01, 02, 03, 04, 05, 06, 07, 08, 09, 10, 11, 12, 13, 14, 15, R1024_4, 8*(R) + 5); \
	R1024(00, 09, 02, 13, 06, 11, 04, 15, 10, 07, 12, 03, 14, 05, 08, 01, R1024_5, 8*(R) + 6); \
	R1024(00, 07, 02, 05, 04, 03, 06, 01, 12, 15, 14, 13, 08, 11, 10, 09, R1024_6, 8*(R) + 7); \
	R1024(00, 15, 02, 11, 06, 13, 04, 09, 14, 01, 08, 05, 10, 03, 12, 07, R1024_7, 8*(R) + 8); \
	I1024(2*(R) + 1);
/*
 * R1024_Unroll_R(NN) is non-zero when 8-round group number NN has to be
 * emitted: either the build is fully unrolled (SKEIN_UNROLL_1024 == 0) and
 * the algorithm has more than NN groups, or the configured unroll count is
 * greater than NN.
 *
 * NOTE(review): the statements guarded by the #if lines below and the
 * matching #endif lines are not visible -- confirm against the complete
 * source.
 */
598 #define R1024_Unroll_R(NN) ((SKEIN_UNROLL_1024 == 0 && SKEIN1024_ROUNDS_TOTAL/8 > (NN)) || (SKEIN_UNROLL_1024 > (NN)))
600 #if R1024_Unroll_R(1)
603 #if R1024_Unroll_R(2)
606 #if R1024_Unroll_R(3)
609 #if R1024_Unroll_R(4)
612 #if R1024_Unroll_R(5)
615 #if R1024_Unroll_R(6)
618 #if R1024_Unroll_R(7)
621 #if R1024_Unroll_R(8)
624 #if R1024_Unroll_R(9)
627 #if R1024_Unroll_R(10)
630 #if R1024_Unroll_R(11)
633 #if R1024_Unroll_R(12)
636 #if R1024_Unroll_R(13)
639 #if R1024_Unroll_R(14)
/*
 * Unroll counts above 14 have no matching guard above, so reject them.
 * NOTE(review): the message names "Skein_1024_Process_Block", whereas the
 * function defined in this file is Skein1024_Process_Block.
 */
642 #if (SKEIN_UNROLL_1024 > 14)
643 #error "need more unrolling in Skein_1024_Process_Block"
/*
 * Feed-forward: each chaining word in ctx->X becomes the final state word
 * XORed with the corresponding input word w[i].
 */
646 /* do the final "feedforward" xor, update context chaining vars */
648 ctx
->X
[0] = X00
^ w
[0];
649 ctx
->X
[1] = X01
^ w
[1];
650 ctx
->X
[2] = X02
^ w
[2];
651 ctx
->X
[3] = X03
^ w
[3];
652 ctx
->X
[4] = X04
^ w
[4];
653 ctx
->X
[5] = X05
^ w
[5];
654 ctx
->X
[6] = X06
^ w
[6];
655 ctx
->X
[7] = X07
^ w
[7];
656 ctx
->X
[8] = X08
^ w
[8];
657 ctx
->X
[9] = X09
^ w
[9];
658 ctx
->X
[10] = X10
^ w
[10];
659 ctx
->X
[11] = X11
^ w
[11];
660 ctx
->X
[12] = X12
^ w
[12];
661 ctx
->X
[13] = X13
^ w
[13];
662 ctx
->X
[14] = X14
^ w
[14];
663 ctx
->X
[15] = X15
^ w
[15];
665 Skein_Show_Round(BLK_BITS
, &ctx
->h
, SKEIN_RND_FEED_FWD
, ctx
->X
);
/* clear SKEIN_T1_FLAG_FIRST in the second tweak word */
667 ts
[1] &= ~SKEIN_T1_FLAG_FIRST
;
/*
 * Unlike the 256- and 512-bit functions, which advance blkPtr right after
 * the initial key injection, this variant advances it after the
 * feed-forward.
 */
668 blkPtr
+= SKEIN1024_BLOCK_BYTES
;
675 #if defined(SKEIN_CODE_SIZE) || defined(SKEIN_PERF)
676 size_t Skein1024_Process_Block_CodeSize(void)
678 return ((u8
*) Skein1024_Process_Block_CodeSize
) -
679 ((u8
*) Skein1024_Process_Block
);
681 unsigned int Skein1024_Unroll_Cnt(void)
683 return SKEIN_UNROLL_1024
;