libsodium: Needed for Dnscrypto-proxy Release 1.3.0
[tomato.git] / release / src / router / libsodium / src / libsodium / crypto_stream / aes128ctr / portable / common.h
blobaa956eb8b73992c589feb605da70e8fad90eb5b7
1 /* Author: Peter Schwabe, ported from an assembly implementation by Emilia Käsper
2 Date: 2009-03-19
3 Public domain */
4 #ifndef COMMON_H
5 #define COMMON_H
7 #include "types.h"
9 #define load32_bigendian crypto_stream_aes128ctr_portable_load32_bigendian
10 uint32 load32_bigendian(const unsigned char *x);
12 #define store32_bigendian crypto_stream_aes128ctr_portable_store32_bigendian
13 void store32_bigendian(unsigned char *x,uint32 u);
15 #define load32_littleendian crypto_stream_aes128ctr_portable_load32_littleendian
16 uint32 load32_littleendian(const unsigned char *x);
18 #define store32_littleendian crypto_stream_aes128ctr_portable_store32_littleendian
19 void store32_littleendian(unsigned char *x,uint32 u);
21 #define load64_littleendian crypto_stream_aes128ctr_portable_load64_littleendian
22 uint64 load64_littleendian(const unsigned char *x);
24 #define store64_littleendian crypto_stream_aes128ctr_portable_store64_littleendian
25 void store64_littleendian(unsigned char *x,uint64 u);
27 /* Macros required only for key expansion */
29 #define keyexpbs1(b0, b1, b2, b3, b4, b5, b6, b7, t0, t1, t2, t3, t4, t5, t6, t7, bskey) \
30 rotbyte(&b0);\
31 rotbyte(&b1);\
32 rotbyte(&b2);\
33 rotbyte(&b3);\
34 rotbyte(&b4);\
35 rotbyte(&b5);\
36 rotbyte(&b6);\
37 rotbyte(&b7);\
39 sbox(b0, b1, b2, b3, b4, b5, b6, b7, t0, t1, t2, t3, t4, t5, t6, t7);\
41 xor_rcon(&b0);\
42 shufb(&b0, EXPB0);\
43 shufb(&b1, EXPB0);\
44 shufb(&b4, EXPB0);\
45 shufb(&b6, EXPB0);\
46 shufb(&b3, EXPB0);\
47 shufb(&b7, EXPB0);\
48 shufb(&b2, EXPB0);\
49 shufb(&b5, EXPB0);\
50 shufb(&b0, EXPB0);\
52 t0 = *(int128 *)(bskey + 0);\
53 t1 = *(int128 *)(bskey + 16);\
54 t2 = *(int128 *)(bskey + 32);\
55 t3 = *(int128 *)(bskey + 48);\
56 t4 = *(int128 *)(bskey + 64);\
57 t5 = *(int128 *)(bskey + 80);\
58 t6 = *(int128 *)(bskey + 96);\
59 t7 = *(int128 *)(bskey + 112);\
61 xor2(&b0, &t0);\
62 xor2(&b1, &t1);\
63 xor2(&b4, &t2);\
64 xor2(&b6, &t3);\
65 xor2(&b3, &t4);\
66 xor2(&b7, &t5);\
67 xor2(&b2, &t6);\
68 xor2(&b5, &t7);\
70 rshift32_littleendian(&t0, 8);\
71 rshift32_littleendian(&t1, 8);\
72 rshift32_littleendian(&t2, 8);\
73 rshift32_littleendian(&t3, 8);\
74 rshift32_littleendian(&t4, 8);\
75 rshift32_littleendian(&t5, 8);\
76 rshift32_littleendian(&t6, 8);\
77 rshift32_littleendian(&t7, 8);\
79 xor2(&b0, &t0);\
80 xor2(&b1, &t1);\
81 xor2(&b4, &t2);\
82 xor2(&b6, &t3);\
83 xor2(&b3, &t4);\
84 xor2(&b7, &t5);\
85 xor2(&b2, &t6);\
86 xor2(&b5, &t7);\
88 rshift32_littleendian(&t0, 8);\
89 rshift32_littleendian(&t1, 8);\
90 rshift32_littleendian(&t2, 8);\
91 rshift32_littleendian(&t3, 8);\
92 rshift32_littleendian(&t4, 8);\
93 rshift32_littleendian(&t5, 8);\
94 rshift32_littleendian(&t6, 8);\
95 rshift32_littleendian(&t7, 8);\
97 xor2(&b0, &t0);\
98 xor2(&b1, &t1);\
99 xor2(&b4, &t2);\
100 xor2(&b6, &t3);\
101 xor2(&b3, &t4);\
102 xor2(&b7, &t5);\
103 xor2(&b2, &t6);\
104 xor2(&b5, &t7);\
106 rshift32_littleendian(&t0, 8);\
107 rshift32_littleendian(&t1, 8);\
108 rshift32_littleendian(&t2, 8);\
109 rshift32_littleendian(&t3, 8);\
110 rshift32_littleendian(&t4, 8);\
111 rshift32_littleendian(&t5, 8);\
112 rshift32_littleendian(&t6, 8);\
113 rshift32_littleendian(&t7, 8);\
115 xor2(&b0, &t0);\
116 xor2(&b1, &t1);\
117 xor2(&b4, &t2);\
118 xor2(&b6, &t3);\
119 xor2(&b3, &t4);\
120 xor2(&b7, &t5);\
121 xor2(&b2, &t6);\
122 xor2(&b5, &t7);\
124 *(int128 *)(bskey + 128) = b0;\
125 *(int128 *)(bskey + 144) = b1;\
126 *(int128 *)(bskey + 160) = b4;\
127 *(int128 *)(bskey + 176) = b6;\
128 *(int128 *)(bskey + 192) = b3;\
129 *(int128 *)(bskey + 208) = b7;\
130 *(int128 *)(bskey + 224) = b2;\
131 *(int128 *)(bskey + 240) = b5;\
133 #define keyexpbs10(b0, b1, b2, b3, b4, b5, b6, b7, t0, t1, t2, t3, t4, t5, t6, t7, bskey) ;\
134 toggle(&b0);\
135 toggle(&b1);\
136 toggle(&b5);\
137 toggle(&b6);\
138 rotbyte(&b0);\
139 rotbyte(&b1);\
140 rotbyte(&b2);\
141 rotbyte(&b3);\
142 rotbyte(&b4);\
143 rotbyte(&b5);\
144 rotbyte(&b6);\
145 rotbyte(&b7);\
147 sbox(b0, b1, b2, b3, b4, b5, b6, b7, t0, t1, t2, t3, t4, t5, t6, t7);\
149 xor_rcon(&b1);\
150 xor_rcon(&b4);\
151 xor_rcon(&b3);\
152 xor_rcon(&b7);\
153 shufb(&b0, EXPB0);\
154 shufb(&b1, EXPB0);\
155 shufb(&b4, EXPB0);\
156 shufb(&b6, EXPB0);\
157 shufb(&b3, EXPB0);\
158 shufb(&b7, EXPB0);\
159 shufb(&b2, EXPB0);\
160 shufb(&b5, EXPB0);\
162 t0 = *(int128 *)(bskey + 9 * 128 + 0);\
163 t1 = *(int128 *)(bskey + 9 * 128 + 16);\
164 t2 = *(int128 *)(bskey + 9 * 128 + 32);\
165 t3 = *(int128 *)(bskey + 9 * 128 + 48);\
166 t4 = *(int128 *)(bskey + 9 * 128 + 64);\
167 t5 = *(int128 *)(bskey + 9 * 128 + 80);\
168 t6 = *(int128 *)(bskey + 9 * 128 + 96);\
169 t7 = *(int128 *)(bskey + 9 * 128 + 112);\
171 toggle(&t0);\
172 toggle(&t1);\
173 toggle(&t5);\
174 toggle(&t6);\
176 xor2(&b0, &t0);\
177 xor2(&b1, &t1);\
178 xor2(&b4, &t2);\
179 xor2(&b6, &t3);\
180 xor2(&b3, &t4);\
181 xor2(&b7, &t5);\
182 xor2(&b2, &t6);\
183 xor2(&b5, &t7);\
185 rshift32_littleendian(&t0, 8);\
186 rshift32_littleendian(&t1, 8);\
187 rshift32_littleendian(&t2, 8);\
188 rshift32_littleendian(&t3, 8);\
189 rshift32_littleendian(&t4, 8);\
190 rshift32_littleendian(&t5, 8);\
191 rshift32_littleendian(&t6, 8);\
192 rshift32_littleendian(&t7, 8);\
194 xor2(&b0, &t0);\
195 xor2(&b1, &t1);\
196 xor2(&b4, &t2);\
197 xor2(&b6, &t3);\
198 xor2(&b3, &t4);\
199 xor2(&b7, &t5);\
200 xor2(&b2, &t6);\
201 xor2(&b5, &t7);\
203 rshift32_littleendian(&t0, 8);\
204 rshift32_littleendian(&t1, 8);\
205 rshift32_littleendian(&t2, 8);\
206 rshift32_littleendian(&t3, 8);\
207 rshift32_littleendian(&t4, 8);\
208 rshift32_littleendian(&t5, 8);\
209 rshift32_littleendian(&t6, 8);\
210 rshift32_littleendian(&t7, 8);\
212 xor2(&b0, &t0);\
213 xor2(&b1, &t1);\
214 xor2(&b4, &t2);\
215 xor2(&b6, &t3);\
216 xor2(&b3, &t4);\
217 xor2(&b7, &t5);\
218 xor2(&b2, &t6);\
219 xor2(&b5, &t7);\
221 rshift32_littleendian(&t0, 8);\
222 rshift32_littleendian(&t1, 8);\
223 rshift32_littleendian(&t2, 8);\
224 rshift32_littleendian(&t3, 8);\
225 rshift32_littleendian(&t4, 8);\
226 rshift32_littleendian(&t5, 8);\
227 rshift32_littleendian(&t6, 8);\
228 rshift32_littleendian(&t7, 8);\
230 xor2(&b0, &t0);\
231 xor2(&b1, &t1);\
232 xor2(&b4, &t2);\
233 xor2(&b6, &t3);\
234 xor2(&b3, &t4);\
235 xor2(&b7, &t5);\
236 xor2(&b2, &t6);\
237 xor2(&b5, &t7);\
239 shufb(&b0, M0);\
240 shufb(&b1, M0);\
241 shufb(&b2, M0);\
242 shufb(&b3, M0);\
243 shufb(&b4, M0);\
244 shufb(&b5, M0);\
245 shufb(&b6, M0);\
246 shufb(&b7, M0);\
248 *(int128 *)(bskey + 1280) = b0;\
249 *(int128 *)(bskey + 1296) = b1;\
250 *(int128 *)(bskey + 1312) = b4;\
251 *(int128 *)(bskey + 1328) = b6;\
252 *(int128 *)(bskey + 1344) = b3;\
253 *(int128 *)(bskey + 1360) = b7;\
254 *(int128 *)(bskey + 1376) = b2;\
255 *(int128 *)(bskey + 1392) = b5;\
258 #define keyexpbs(b0, b1, b2, b3, b4, b5, b6, b7, t0, t1, t2, t3, t4, t5, t6, t7, rcon, i, bskey) \
259 toggle(&b0);\
260 toggle(&b1);\
261 toggle(&b5);\
262 toggle(&b6);\
263 rotbyte(&b0);\
264 rotbyte(&b1);\
265 rotbyte(&b2);\
266 rotbyte(&b3);\
267 rotbyte(&b4);\
268 rotbyte(&b5);\
269 rotbyte(&b6);\
270 rotbyte(&b7);\
272 sbox(b0, b1, b2, b3, b4, b5, b6, b7, t0, t1, t2, t3, t4, t5, t6, t7);\
274 rcon;\
275 shufb(&b0, EXPB0);\
276 shufb(&b1, EXPB0);\
277 shufb(&b4, EXPB0);\
278 shufb(&b6, EXPB0);\
279 shufb(&b3, EXPB0);\
280 shufb(&b7, EXPB0);\
281 shufb(&b2, EXPB0);\
282 shufb(&b5, EXPB0);\
284 t0 = *(int128 *)(bskey + (i-1) * 128 + 0);\
285 t1 = *(int128 *)(bskey + (i-1) * 128 + 16);\
286 t2 = *(int128 *)(bskey + (i-1) * 128 + 32);\
287 t3 = *(int128 *)(bskey + (i-1) * 128 + 48);\
288 t4 = *(int128 *)(bskey + (i-1) * 128 + 64);\
289 t5 = *(int128 *)(bskey + (i-1) * 128 + 80);\
290 t6 = *(int128 *)(bskey + (i-1) * 128 + 96);\
291 t7 = *(int128 *)(bskey + (i-1) * 128 + 112);\
293 toggle(&t0);\
294 toggle(&t1);\
295 toggle(&t5);\
296 toggle(&t6);\
298 xor2(&b0, &t0);\
299 xor2(&b1, &t1);\
300 xor2(&b4, &t2);\
301 xor2(&b6, &t3);\
302 xor2(&b3, &t4);\
303 xor2(&b7, &t5);\
304 xor2(&b2, &t6);\
305 xor2(&b5, &t7);\
307 rshift32_littleendian(&t0, 8);\
308 rshift32_littleendian(&t1, 8);\
309 rshift32_littleendian(&t2, 8);\
310 rshift32_littleendian(&t3, 8);\
311 rshift32_littleendian(&t4, 8);\
312 rshift32_littleendian(&t5, 8);\
313 rshift32_littleendian(&t6, 8);\
314 rshift32_littleendian(&t7, 8);\
316 xor2(&b0, &t0);\
317 xor2(&b1, &t1);\
318 xor2(&b4, &t2);\
319 xor2(&b6, &t3);\
320 xor2(&b3, &t4);\
321 xor2(&b7, &t5);\
322 xor2(&b2, &t6);\
323 xor2(&b5, &t7);\
325 rshift32_littleendian(&t0, 8);\
326 rshift32_littleendian(&t1, 8);\
327 rshift32_littleendian(&t2, 8);\
328 rshift32_littleendian(&t3, 8);\
329 rshift32_littleendian(&t4, 8);\
330 rshift32_littleendian(&t5, 8);\
331 rshift32_littleendian(&t6, 8);\
332 rshift32_littleendian(&t7, 8);\
334 xor2(&b0, &t0);\
335 xor2(&b1, &t1);\
336 xor2(&b4, &t2);\
337 xor2(&b6, &t3);\
338 xor2(&b3, &t4);\
339 xor2(&b7, &t5);\
340 xor2(&b2, &t6);\
341 xor2(&b5, &t7);\
343 rshift32_littleendian(&t0, 8);\
344 rshift32_littleendian(&t1, 8);\
345 rshift32_littleendian(&t2, 8);\
346 rshift32_littleendian(&t3, 8);\
347 rshift32_littleendian(&t4, 8);\
348 rshift32_littleendian(&t5, 8);\
349 rshift32_littleendian(&t6, 8);\
350 rshift32_littleendian(&t7, 8);\
352 xor2(&b0, &t0);\
353 xor2(&b1, &t1);\
354 xor2(&b4, &t2);\
355 xor2(&b6, &t3);\
356 xor2(&b3, &t4);\
357 xor2(&b7, &t5);\
358 xor2(&b2, &t6);\
359 xor2(&b5, &t7);\
361 *(int128 *)(bskey + i*128 + 0) = b0;\
362 *(int128 *)(bskey + i*128 + 16) = b1;\
363 *(int128 *)(bskey + i*128 + 32) = b4;\
364 *(int128 *)(bskey + i*128 + 48) = b6;\
365 *(int128 *)(bskey + i*128 + 64) = b3;\
366 *(int128 *)(bskey + i*128 + 80) = b7;\
367 *(int128 *)(bskey + i*128 + 96) = b2;\
368 *(int128 *)(bskey + i*128 + 112) = b5;\
370 /* Macros used in multiple contexts */
372 #define bitslicekey0(key, bskey) \
373 xmm0 = *(int128 *) (key + 0);\
374 shufb(&xmm0, M0);\
375 copy2(&xmm1, &xmm0);\
376 copy2(&xmm2, &xmm0);\
377 copy2(&xmm3, &xmm0);\
378 copy2(&xmm4, &xmm0);\
379 copy2(&xmm5, &xmm0);\
380 copy2(&xmm6, &xmm0);\
381 copy2(&xmm7, &xmm0);\
383 bitslice(xmm7, xmm6, xmm5, xmm4, xmm3, xmm2, xmm1, xmm0, t);\
385 *(int128 *) (bskey + 0) = xmm0;\
386 *(int128 *) (bskey + 16) = xmm1;\
387 *(int128 *) (bskey + 32) = xmm2;\
388 *(int128 *) (bskey + 48) = xmm3;\
389 *(int128 *) (bskey + 64) = xmm4;\
390 *(int128 *) (bskey + 80) = xmm5;\
391 *(int128 *) (bskey + 96) = xmm6;\
392 *(int128 *) (bskey + 112) = xmm7;\
395 #define bitslicekey10(key, bskey) \
396 xmm0 = *(int128 *) (key + 0);\
397 copy2(xmm1, xmm0);\
398 copy2(xmm2, xmm0);\
399 copy2(xmm3, xmm0);\
400 copy2(xmm4, xmm0);\
401 copy2(xmm5, xmm0);\
402 copy2(xmm6, xmm0);\
403 copy2(xmm7, xmm0);\
405 bitslice(xmm7, xmm6, xmm5, xmm4, xmm3, xmm2, xmm1, xmm0, t);\
407 toggle(&xmm6);\
408 toggle(&xmm5);\
409 toggle(&xmm1);\
410 toggle(&xmm0);\
412 *(int128 *) (bskey + 0 + 1280) = xmm0;\
413 *(int128 *) (bskey + 16 + 1280) = xmm1;\
414 *(int128 *) (bskey + 32 + 1280) = xmm2;\
415 *(int128 *) (bskey + 48 + 1280) = xmm3;\
416 *(int128 *) (bskey + 64 + 1280) = xmm4;\
417 *(int128 *) (bskey + 80 + 1280) = xmm5;\
418 *(int128 *) (bskey + 96 + 1280) = xmm6;\
419 *(int128 *) (bskey + 112 + 1280) = xmm7;\
422 #define bitslicekey(i,key,bskey) \
423 xmm0 = *(int128 *) (key + 0);\
424 shufb(&xmm0, M0);\
425 copy2(&xmm1, &xmm0);\
426 copy2(&xmm2, &xmm0);\
427 copy2(&xmm3, &xmm0);\
428 copy2(&xmm4, &xmm0);\
429 copy2(&xmm5, &xmm0);\
430 copy2(&xmm6, &xmm0);\
431 copy2(&xmm7, &xmm0);\
433 bitslice(xmm7, xmm6, xmm5, xmm4, xmm3, xmm2, xmm1, xmm0, t);\
435 toggle(&xmm6);\
436 toggle(&xmm5);\
437 toggle(&xmm1);\
438 toggle(&xmm0);\
440 *(int128 *) (bskey + 0 + 128*i) = xmm0;\
441 *(int128 *) (bskey + 16 + 128*i) = xmm1;\
442 *(int128 *) (bskey + 32 + 128*i) = xmm2;\
443 *(int128 *) (bskey + 48 + 128*i) = xmm3;\
444 *(int128 *) (bskey + 64 + 128*i) = xmm4;\
445 *(int128 *) (bskey + 80 + 128*i) = xmm5;\
446 *(int128 *) (bskey + 96 + 128*i) = xmm6;\
447 *(int128 *) (bskey + 112 + 128*i) = xmm7;\
450 #define bitslice(x0, x1, x2, x3, x4, x5, x6, x7, t) \
451 swapmove(x0, x1, 1, BS0, t);\
452 swapmove(x2, x3, 1, BS0, t);\
453 swapmove(x4, x5, 1, BS0, t);\
454 swapmove(x6, x7, 1, BS0, t);\
456 swapmove(x0, x2, 2, BS1, t);\
457 swapmove(x1, x3, 2, BS1, t);\
458 swapmove(x4, x6, 2, BS1, t);\
459 swapmove(x5, x7, 2, BS1, t);\
461 swapmove(x0, x4, 4, BS2, t);\
462 swapmove(x1, x5, 4, BS2, t);\
463 swapmove(x2, x6, 4, BS2, t);\
464 swapmove(x3, x7, 4, BS2, t);\
467 #define swapmove(a, b, n, m, t) \
468 copy2(&t, &b);\
469 rshift64_littleendian(&t, n);\
470 xor2(&t, &a);\
471 and2(&t, &m);\
472 xor2(&a, &t);\
473 lshift64_littleendian(&t, n);\
474 xor2(&b, &t);
476 #define rotbyte(x) \
477 shufb(x, ROTB) /* TODO: Make faster */
480 /* Macros used for encryption (and decryption) */
482 #define shiftrows(x0, x1, x2, x3, x4, x5, x6, x7, i, M, bskey) \
483 xor2(&x0, (int128 *)(bskey + 128*(i-1) + 0));\
484 shufb(&x0, M);\
485 xor2(&x1, (int128 *)(bskey + 128*(i-1) + 16));\
486 shufb(&x1, M);\
487 xor2(&x2, (int128 *)(bskey + 128*(i-1) + 32));\
488 shufb(&x2, M);\
489 xor2(&x3, (int128 *)(bskey + 128*(i-1) + 48));\
490 shufb(&x3, M);\
491 xor2(&x4, (int128 *)(bskey + 128*(i-1) + 64));\
492 shufb(&x4, M);\
493 xor2(&x5, (int128 *)(bskey + 128*(i-1) + 80));\
494 shufb(&x5, M);\
495 xor2(&x6, (int128 *)(bskey + 128*(i-1) + 96));\
496 shufb(&x6, M);\
497 xor2(&x7, (int128 *)(bskey + 128*(i-1) + 112));\
498 shufb(&x7, M);\
501 #define mixcolumns(x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2, t3, t4, t5, t6, t7) \
502 shufd(&t0, &x0, 0x93);\
503 shufd(&t1, &x1, 0x93);\
504 shufd(&t2, &x2, 0x93);\
505 shufd(&t3, &x3, 0x93);\
506 shufd(&t4, &x4, 0x93);\
507 shufd(&t5, &x5, 0x93);\
508 shufd(&t6, &x6, 0x93);\
509 shufd(&t7, &x7, 0x93);\
511 xor2(&x0, &t0);\
512 xor2(&x1, &t1);\
513 xor2(&x2, &t2);\
514 xor2(&x3, &t3);\
515 xor2(&x4, &t4);\
516 xor2(&x5, &t5);\
517 xor2(&x6, &t6);\
518 xor2(&x7, &t7);\
520 xor2(&t0, &x7);\
521 xor2(&t1, &x0);\
522 xor2(&t2, &x1);\
523 xor2(&t1, &x7);\
524 xor2(&t3, &x2);\
525 xor2(&t4, &x3);\
526 xor2(&t5, &x4);\
527 xor2(&t3, &x7);\
528 xor2(&t6, &x5);\
529 xor2(&t7, &x6);\
530 xor2(&t4, &x7);\
532 shufd(&x0, &x0, 0x4e);\
533 shufd(&x1, &x1, 0x4e);\
534 shufd(&x2, &x2, 0x4e);\
535 shufd(&x3, &x3, 0x4e);\
536 shufd(&x4, &x4, 0x4e);\
537 shufd(&x5, &x5, 0x4e);\
538 shufd(&x6, &x6, 0x4e);\
539 shufd(&x7, &x7, 0x4e);\
541 xor2(&t0, &x0);\
542 xor2(&t1, &x1);\
543 xor2(&t2, &x2);\
544 xor2(&t3, &x3);\
545 xor2(&t4, &x4);\
546 xor2(&t5, &x5);\
547 xor2(&t6, &x6);\
548 xor2(&t7, &x7);\
551 #define aesround(i, b0, b1, b2, b3, b4, b5, b6, b7, t0, t1, t2, t3, t4, t5, t6, t7, bskey) \
552 shiftrows(b0, b1, b2, b3, b4, b5, b6, b7, i, SR, bskey);\
553 sbox(b0, b1, b2, b3, b4, b5, b6, b7, t0, t1, t2, t3, t4, t5, t6, t7);\
554 mixcolumns(b0, b1, b4, b6, b3, b7, b2, b5, t0, t1, t2, t3, t4, t5, t6, t7);\
557 #define lastround(b0, b1, b2, b3, b4, b5, b6, b7, t0, t1, t2, t3, t4, t5, t6, t7, bskey) \
558 shiftrows(b0, b1, b2, b3, b4, b5, b6, b7, 10, SRM0, bskey);\
559 sbox(b0, b1, b2, b3, b4, b5, b6, b7, t0, t1, t2, t3, t4, t5, t6, t7);\
560 xor2(&b0,(int128 *)(bskey + 128*10));\
561 xor2(&b1,(int128 *)(bskey + 128*10+16));\
562 xor2(&b4,(int128 *)(bskey + 128*10+32));\
563 xor2(&b6,(int128 *)(bskey + 128*10+48));\
564 xor2(&b3,(int128 *)(bskey + 128*10+64));\
565 xor2(&b7,(int128 *)(bskey + 128*10+80));\
566 xor2(&b2,(int128 *)(bskey + 128*10+96));\
567 xor2(&b5,(int128 *)(bskey + 128*10+112));\
570 #define sbox(b0, b1, b2, b3, b4, b5, b6, b7, t0, t1, t2, t3, s0, s1, s2, s3) \
571 InBasisChange(b0, b1, b2, b3, b4, b5, b6, b7); \
572 Inv_GF256(b6, b5, b0, b3, b7, b1, b4, b2, t0, t1, t2, t3, s0, s1, s2, s3); \
573 OutBasisChange(b7, b1, b4, b2, b6, b5, b0, b3); \
576 #define InBasisChange(b0, b1, b2, b3, b4, b5, b6, b7) \
577 xor2(&b5, &b6);\
578 xor2(&b2, &b1);\
579 xor2(&b5, &b0);\
580 xor2(&b6, &b2);\
581 xor2(&b3, &b0);\
583 xor2(&b6, &b3);\
584 xor2(&b3, &b7);\
585 xor2(&b3, &b4);\
586 xor2(&b7, &b5);\
587 xor2(&b3, &b1);\
589 xor2(&b4, &b5);\
590 xor2(&b2, &b7);\
591 xor2(&b1, &b5);\
593 #define OutBasisChange(b0, b1, b2, b3, b4, b5, b6, b7) \
594 xor2(&b0, &b6);\
595 xor2(&b1, &b4);\
596 xor2(&b2, &b0);\
597 xor2(&b4, &b6);\
598 xor2(&b6, &b1);\
600 xor2(&b1, &b5);\
601 xor2(&b5, &b3);\
602 xor2(&b2, &b5);\
603 xor2(&b3, &b7);\
604 xor2(&b7, &b5);\
606 xor2(&b4, &b7);\
608 #define Mul_GF4(x0, x1, y0, y1, t0) \
609 copy2(&t0, &y0);\
610 xor2(&t0, &y1);\
611 and2(&t0, &x0);\
612 xor2(&x0, &x1);\
613 and2(&x0, &y1);\
614 and2(&x1, &y0);\
615 xor2(&x0, &x1);\
616 xor2(&x1, &t0);\
618 #define Mul_GF4_N(x0, x1, y0, y1, t0) \
619 copy2(&t0, &y0);\
620 xor2(&t0, &y1);\
621 and2(&t0, &x0);\
622 xor2(&x0, &x1);\
623 and2(&x0, &y1);\
624 and2(&x1, &y0);\
625 xor2(&x1, &x0);\
626 xor2(&x0, &t0);\
628 #define Mul_GF4_2(x0, x1, x2, x3, y0, y1, t0, t1) \
629 copy2(&t0, = y0);\
630 xor2(&t0, &y1);\
631 copy2(&t1, &t0);\
632 and2(&t0, &x0);\
633 and2(&t1, &x2);\
634 xor2(&x0, &x1);\
635 xor2(&x2, &x3);\
636 and2(&x0, &y1);\
637 and2(&x2, &y1);\
638 and2(&x1, &y0);\
639 and2(&x3, &y0);\
640 xor2(&x0, &x1);\
641 xor2(&x2, &x3);\
642 xor2(&x1, &t0);\
643 xor2(&x3, &t1);\
645 #define Mul_GF16(x0, x1, x2, x3, y0, y1, y2, y3, t0, t1, t2, t3) \
646 copy2(&t0, &x0);\
647 copy2(&t1, &x1);\
648 Mul_GF4(x0, x1, y0, y1, t2);\
649 xor2(&t0, &x2);\
650 xor2(&t1, &x3);\
651 xor2(&y0, &y2);\
652 xor2(&y1, &y3);\
653 Mul_GF4_N(t0, t1, y0, y1, t2);\
654 Mul_GF4(x2, x3, y2, y3, t3);\
656 xor2(&x0, &t0);\
657 xor2(&x2, &t0);\
658 xor2(&x1, &t1);\
659 xor2(&x3, &t1);\
661 #define Mul_GF16_2(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, t0, t1, t2, t3) \
662 copy2(&t0, &x0);\
663 copy2(&t1, &x1);\
664 Mul_GF4(x0, x1, y0, y1, t2);\
665 xor2(&t0, &x2);\
666 xor2(&t1, &x3);\
667 xor2(&y0, &y2);\
668 xor2(&y1, &y3);\
669 Mul_GF4_N(t0, t1, y0, y1, t3);\
670 Mul_GF4(x2, x3, y2, y3, t2);\
672 xor2(&x0, &t0);\
673 xor2(&x2, &t0);\
674 xor2(&x1, &t1);\
675 xor2(&x3, &t1);\
677 copy2(&t0, &x4);\
678 copy2(&t1, &x5);\
679 xor2(&t0, &x6);\
680 xor2(&t1, &x7);\
681 Mul_GF4_N(t0, t1, y0, y1, t3);\
682 Mul_GF4(x6, x7, y2, y3, t2);\
683 xor2(&y0, &y2);\
684 xor2(&y1, &y3);\
685 Mul_GF4(x4, x5, y0, y1, t3);\
687 xor2(&x4, &t0);\
688 xor2(&x6, &t0);\
689 xor2(&x5, &t1);\
690 xor2(&x7, &t1);\
692 #define Inv_GF16(x0, x1, x2, x3, t0, t1, t2, t3) \
693 copy2(&t0, &x1);\
694 copy2(&t1, &x0);\
695 and2(&t0, &x3);\
696 or2(&t1, &x2);\
697 copy2(&t2, &x1);\
698 copy2(&t3, &x0);\
699 or2(&t2, &x2);\
700 or2(&t3, &x3);\
701 xor2(&t2, &t3);\
703 xor2(&t0, &t2);\
704 xor2(&t1, &t2);\
706 Mul_GF4_2(x0, x1, x2, x3, t1, t0, t2, t3);\
709 #define Inv_GF256(x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2, t3, s0, s1, s2, s3) \
710 copy2(&t3, &x4);\
711 copy2(&t2, &x5);\
712 copy2(&t1, &x1);\
713 copy2(&s1, &x7);\
714 copy2(&s0, &x0);\
716 xor2(&t3, &x6);\
717 xor2(&t2, &x7);\
718 xor2(&t1, &x3);\
719 xor2(&s1, &x6);\
720 xor2(&s0, &x2);\
722 copy2(&s2, &t3);\
723 copy2(&t0, &t2);\
724 copy2(&s3, &t3);\
726 or2(&t2, &t1);\
727 or2(&t3, &s0);\
728 xor2(&s3, &t0);\
729 and2(&s2, &s0);\
730 and2(&t0, &t1);\
731 xor2(&s0, &t1);\
732 and2(&s3, &s0);\
733 copy2(&s0, &x3);\
734 xor2(&s0, &x2);\
735 and2(&s1, &s0);\
736 xor2(&t3, &s1);\
737 xor2(&t2, &s1);\
738 copy2(&s1, &x4);\
739 xor2(&s1, &x5);\
740 copy2(&s0, &x1);\
741 copy2(&t1, &s1);\
742 xor2(&s0, &x0);\
743 or2(&t1, &s0);\
744 and2(&s1, &s0);\
745 xor2(&t0, &s1);\
746 xor2(&t3, &s3);\
747 xor2(&t2, &s2);\
748 xor2(&t1, &s3);\
749 xor2(&t0, &s2);\
750 xor2(&t1, &s2);\
751 copy2(&s0, &x7);\
752 copy2(&s1, &x6);\
753 copy2(&s2, &x5);\
754 copy2(&s3, &x4);\
755 and2(&s0, &x3);\
756 and2(&s1, &x2);\
757 and2(&s2, &x1);\
758 or2(&s3, &x0);\
759 xor2(&t3, &s0);\
760 xor2(&t2, &s1);\
761 xor2(&t1, &s2);\
762 xor2(&t0, &s3);\
764 copy2(&s0, &t3);\
765 xor2(&s0, &t2);\
766 and2(&t3, &t1);\
767 copy2(&s2, &t0);\
768 xor2(&s2, &t3);\
769 copy2(&s3, &s0);\
770 and2(&s3, &s2);\
771 xor2(&s3, &t2);\
772 copy2(&s1, &t1);\
773 xor2(&s1, &t0);\
774 xor2(&t3, &t2);\
775 and2(&s1, &t3);\
776 xor2(&s1, &t0);\
777 xor2(&t1, &s1);\
778 copy2(&t2, &s2);\
779 xor2(&t2, &s1);\
780 and2(&t2, &t0);\
781 xor2(&t1, &t2);\
782 xor2(&s2, &t2);\
783 and2(&s2, &s3);\
784 xor2(&s2, &s0);\
786 Mul_GF16_2(x0, x1, x2, x3, x4, x5, x6, x7, s3, s2, s1, t1, s0, t0, t2, t3);\
788 #endif