/*
 * SHA-512 algorithm as described at
 *
 *   http://csrc.nist.gov/cryptval/shs.html
 *
 * Modifications made for SHA-384 also
 */
/*
 * Start by deciding whether we can support hardware SHA at all.
 */
#define HW_SHA512_NONE 0
#define HW_SHA512_NEON 1

#ifdef _FORCE_SHA512_NEON
#   define HW_SHA512 HW_SHA512_NEON
#elif defined __BYTE_ORDER__ && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
    /* Arm can potentially support both endiannesses, but this code
     * hasn't been tested on anything but little. If anyone wants to
     * run big-endian, they'll need to fix it first. */
#elif defined __ARM_FEATURE_SHA512
    /* If the Arm SHA-512 extension is available already, we can
     * support NEON SHA without having to enable anything by hand */
#   define HW_SHA512 HW_SHA512_NEON
#elif defined(__clang__)
#   if __has_attribute(target) && __has_include(<arm_neon.h>) && \
    (defined(__aarch64__))
        /* clang can enable the crypto extension in AArch64 using
         * __attribute__((target)) */
#       define HW_SHA512 HW_SHA512_NEON
#       define USE_CLANG_ATTR_TARGET_AARCH64
#   endif
#endif

/* Fall back to the pure software implementation if nothing above
 * decided we could accelerate, or if the build forces software SHA. */
#if defined _FORCE_SOFTWARE_SHA || !defined HW_SHA512
#   undef HW_SHA512
#   define HW_SHA512 HW_SHA512_NONE
#endif
/*
 * The actual query function that asks if hardware acceleration is
 * available.
 */
static bool sha512_hw_available(void);

/*
 * The top-level selection function, caching the results of
 * sha512_hw_available() so it only has to run once.
 */
static bool sha512_hw_available_cached(void)
{
    static bool initialised = false;
    static bool hw_available;
    /* Only probe the hardware once; subsequent calls reuse the
     * cached answer. */
    if (!initialised) {
        hw_available = sha512_hw_available();
        initialised = true;
    }
    return hw_available;
}
64 struct sha512_select_options {
\r
65 const ssh_hashalg *hw, *sw;
\r
68 static ssh_hash *sha512_select(const ssh_hashalg *alg)
\r
70 const struct sha512_select_options *options =
\r
71 (const struct sha512_select_options *)alg->extra;
\r
73 const ssh_hashalg *real_alg =
\r
74 sha512_hw_available_cached() ? options->hw : options->sw;
\r
76 return ssh_hash_new(real_alg);
\r
79 const struct sha512_select_options ssh_sha512_select_options = {
\r
80 &ssh_sha512_hw, &ssh_sha512_sw,
\r
82 const struct sha512_select_options ssh_sha384_select_options = {
\r
83 &ssh_sha384_hw, &ssh_sha384_sw,
\r
86 const ssh_hashalg ssh_sha512 = {
\r
87 .new = sha512_select,
\r
90 HASHALG_NAMES_ANNOTATED("SHA-512", "dummy selector vtable"),
\r
91 .extra = &ssh_sha512_select_options,
\r
94 const ssh_hashalg ssh_sha384 = {
\r
95 .new = sha512_select,
\r
98 HASHALG_NAMES_ANNOTATED("SHA-384", "dummy selector vtable"),
\r
99 .extra = &ssh_sha384_select_options,
\r
/* ----------------------------------------------------------------------
 * Definitions likely to be helpful to multiple implementations.
 */

/* SHA-512 initial hash values H0..H7 (FIPS 180-4 section 5.3.5) */
static const uint64_t sha512_initial_state[] = {
    0x6a09e667f3bcc908ULL,
    0xbb67ae8584caa73bULL,
    0x3c6ef372fe94f82bULL,
    0xa54ff53a5f1d36f1ULL,
    0x510e527fade682d1ULL,
    0x9b05688c2b3e6c1fULL,
    0x1f83d9abfb41bd6bULL,
    0x5be0cd19137e2179ULL,
};

/* SHA-384 initial hash values (FIPS 180-4 section 5.3.4); same
 * compression function as SHA-512, different IV and truncated output */
static const uint64_t sha384_initial_state[] = {
    0xcbbb9d5dc1059ed8ULL,
    0x629a292a367cd507ULL,
    0x9159015a3070dd17ULL,
    0x152fecd8f70e5939ULL,
    0x67332667ffc00b31ULL,
    0x8eb44a8768581511ULL,
    0xdb0c2e0d64f98fa7ULL,
    0x47b5481dbefa4fa4ULL,
};
/* The 80 SHA-512 round constants K0..K79 (FIPS 180-4 section 4.2.3):
 * first 64 bits of the fractional parts of the cube roots of the
 * first 80 primes */
static const uint64_t sha512_round_constants[] = {
    0x428a2f98d728ae22ULL, 0x7137449123ef65cdULL,
    0xb5c0fbcfec4d3b2fULL, 0xe9b5dba58189dbbcULL,
    0x3956c25bf348b538ULL, 0x59f111f1b605d019ULL,
    0x923f82a4af194f9bULL, 0xab1c5ed5da6d8118ULL,
    0xd807aa98a3030242ULL, 0x12835b0145706fbeULL,
    0x243185be4ee4b28cULL, 0x550c7dc3d5ffb4e2ULL,
    0x72be5d74f27b896fULL, 0x80deb1fe3b1696b1ULL,
    0x9bdc06a725c71235ULL, 0xc19bf174cf692694ULL,
    0xe49b69c19ef14ad2ULL, 0xefbe4786384f25e3ULL,
    0x0fc19dc68b8cd5b5ULL, 0x240ca1cc77ac9c65ULL,
    0x2de92c6f592b0275ULL, 0x4a7484aa6ea6e483ULL,
    0x5cb0a9dcbd41fbd4ULL, 0x76f988da831153b5ULL,
    0x983e5152ee66dfabULL, 0xa831c66d2db43210ULL,
    0xb00327c898fb213fULL, 0xbf597fc7beef0ee4ULL,
    0xc6e00bf33da88fc2ULL, 0xd5a79147930aa725ULL,
    0x06ca6351e003826fULL, 0x142929670a0e6e70ULL,
    0x27b70a8546d22ffcULL, 0x2e1b21385c26c926ULL,
    0x4d2c6dfc5ac42aedULL, 0x53380d139d95b3dfULL,
    0x650a73548baf63deULL, 0x766a0abb3c77b2a8ULL,
    0x81c2c92e47edaee6ULL, 0x92722c851482353bULL,
    0xa2bfe8a14cf10364ULL, 0xa81a664bbc423001ULL,
    0xc24b8b70d0f89791ULL, 0xc76c51a30654be30ULL,
    0xd192e819d6ef5218ULL, 0xd69906245565a910ULL,
    0xf40e35855771202aULL, 0x106aa07032bbd1b8ULL,
    0x19a4c116b8d2d0c8ULL, 0x1e376c085141ab53ULL,
    0x2748774cdf8eeb99ULL, 0x34b0bcb5e19b48a8ULL,
    0x391c0cb3c5c95a63ULL, 0x4ed8aa4ae3418acbULL,
    0x5b9cca4f7763e373ULL, 0x682e6ff3d6b2b8a3ULL,
    0x748f82ee5defb2fcULL, 0x78a5636f43172f60ULL,
    0x84c87814a1f0ab72ULL, 0x8cc702081a6439ecULL,
    0x90befffa23631e28ULL, 0xa4506cebde82bde9ULL,
    0xbef9a3f7b2c67915ULL, 0xc67178f2e372532bULL,
    0xca273eceea26619cULL, 0xd186b8c721c0c207ULL,
    0xeada7dd6cde0eb1eULL, 0xf57d4f7fee6ed178ULL,
    0x06f067aa72176fbaULL, 0x0a637dc5a2c898a6ULL,
    0x113f9804bef90daeULL, 0x1b710b35131c471bULL,
    0x28db77f523047d84ULL, 0x32caab7b40c72493ULL,
    0x3c9ebe0a15c9bebcULL, 0x431d67c49c100d4cULL,
    0x4cc5d4becb3e42b6ULL, 0x597f299cfc657e2aULL,
    0x5fcb6fab3ad6faecULL, 0x6c44198c4a475817ULL,
};

#define SHA512_ROUNDS 80
/*
 * One 128-byte block buffer shared between the software and hardware
 * implementations, together with the running bit count (128 bits,
 * split into lenhi:lenlo).
 */
typedef struct sha512_block sha512_block;
struct sha512_block {
    uint8_t block[128];
    size_t used;                /* bytes currently buffered in block[] */
    uint64_t lenhi, lenlo;      /* total message length in bits, 128-bit */
};

/* Reset the buffer to empty, with a zero running length. */
static inline void sha512_block_setup(sha512_block *blk)
{
    blk->used = 0;
    blk->lenhi = blk->lenlo = 0;
}

/*
 * Absorb as much of (*vdata, *len) as fits into the block buffer,
 * advancing both. Returns true exactly when a full 128-byte block has
 * been completed (and resets 'used', so the caller must consume
 * blk->block immediately).
 */
static inline bool sha512_block_write(
    sha512_block *blk, const void **vdata, size_t *len)
{
    size_t blkleft = sizeof(blk->block) - blk->used;
    size_t chunk = *len < blkleft ? *len : blkleft;

    const uint8_t *p = *vdata;
    memcpy(blk->block + blk->used, p, chunk);
    *vdata = p + chunk;
    *len -= chunk;
    blk->used += chunk;

    /* Update the length field, counting in bits */
    size_t chunkbits = chunk << 3;

    blk->lenlo += chunkbits;
    /* carry into the high word if lenlo wrapped */
    blk->lenhi += (blk->lenlo < chunkbits);

    if (blk->used == sizeof(blk->block)) {
        blk->used = 0;
        return true;
    }

    return false;
}
211 static inline void sha512_block_pad(sha512_block *blk, BinarySink *bs)
\r
213 uint64_t final_lenhi = blk->lenhi;
\r
214 uint64_t final_lenlo = blk->lenlo;
\r
215 size_t pad = 127 & (111 - blk->used);
\r
217 put_byte(bs, 0x80);
\r
218 put_padding(bs, pad, 0);
\r
219 put_uint64(bs, final_lenhi);
\r
220 put_uint64(bs, final_lenlo);
\r
222 assert(blk->used == 0 && "Should have exactly hit a block boundary");
\r
/* ----------------------------------------------------------------------
 * Software implementation of SHA-512.
 */

/* Rotate right a 64-bit word by y bits; safe for y == 0 because both
 * shift counts are masked to 0..63 */
static inline uint64_t ror(uint64_t x, unsigned y)
{
    return (x << (63 & -y)) | (x >> (63 & y));
}

/* Bitwise choice: for each bit, select if1 where ctrl is 1, if0 where
 * ctrl is 0 (FIPS 180-4 Ch function) */
static inline uint64_t Ch(uint64_t ctrl, uint64_t if1, uint64_t if0)
{
    return if0 ^ (ctrl & (if1 ^ if0));
}

/* Bitwise majority of the three inputs (FIPS 180-4 Maj function) */
static inline uint64_t Maj(uint64_t x, uint64_t y, uint64_t z)
{
    return (x & y) | (z & (x | y));
}

/* The four sigma functions from FIPS 180-4 section 4.1.3 */
static inline uint64_t Sigma_0(uint64_t x)
{
    return ror(x,28) ^ ror(x,34) ^ ror(x,39);
}

static inline uint64_t Sigma_1(uint64_t x)
{
    return ror(x,14) ^ ror(x,18) ^ ror(x,41);
}

static inline uint64_t sigma_0(uint64_t x)
{
    return ror(x,1) ^ ror(x,8) ^ (x >> 7);
}

static inline uint64_t sigma_1(uint64_t x)
{
    return ror(x,19) ^ ror(x,61) ^ (x >> 6);
}
264 static inline void sha512_sw_round(
\r
265 unsigned round_index, const uint64_t *schedule,
\r
266 uint64_t *a, uint64_t *b, uint64_t *c, uint64_t *d,
\r
267 uint64_t *e, uint64_t *f, uint64_t *g, uint64_t *h)
\r
269 uint64_t t1 = *h + Sigma_1(*e) + Ch(*e,*f,*g) +
\r
270 sha512_round_constants[round_index] + schedule[round_index];
\r
272 uint64_t t2 = Sigma_0(*a) + Maj(*a,*b,*c);
\r
278 static void sha512_sw_block(uint64_t *core, const uint8_t *block)
\r
280 uint64_t w[SHA512_ROUNDS];
\r
281 uint64_t a,b,c,d,e,f,g,h;
\r
285 for (t = 0; t < 16; t++)
\r
286 w[t] = GET_64BIT_MSB_FIRST(block + 8*t);
\r
288 for (t = 16; t < SHA512_ROUNDS; t++)
\r
289 w[t] = w[t-16] + w[t-7] + sigma_0(w[t-15]) + sigma_1(w[t-2]);
\r
291 a = core[0]; b = core[1]; c = core[2]; d = core[3];
\r
292 e = core[4]; f = core[5]; g = core[6]; h = core[7];
\r
294 for (t = 0; t < SHA512_ROUNDS; t+=8) {
\r
295 sha512_sw_round(t+0, w, &a,&b,&c,&d,&e,&f,&g,&h);
\r
296 sha512_sw_round(t+1, w, &h,&a,&b,&c,&d,&e,&f,&g);
\r
297 sha512_sw_round(t+2, w, &g,&h,&a,&b,&c,&d,&e,&f);
\r
298 sha512_sw_round(t+3, w, &f,&g,&h,&a,&b,&c,&d,&e);
\r
299 sha512_sw_round(t+4, w, &e,&f,&g,&h,&a,&b,&c,&d);
\r
300 sha512_sw_round(t+5, w, &d,&e,&f,&g,&h,&a,&b,&c);
\r
301 sha512_sw_round(t+6, w, &c,&d,&e,&f,&g,&h,&a,&b);
\r
302 sha512_sw_round(t+7, w, &b,&c,&d,&e,&f,&g,&h,&a);
\r
305 core[0] += a; core[1] += b; core[2] += c; core[3] += d;
\r
306 core[4] += e; core[5] += f; core[6] += g; core[7] += h;
\r
308 smemclr(w, sizeof(w));
\r
311 typedef struct sha512_sw {
\r
314 BinarySink_IMPLEMENTATION;
\r
318 static void sha512_sw_write(BinarySink *bs, const void *vp, size_t len);
\r
320 static ssh_hash *sha512_sw_new(const ssh_hashalg *alg)
\r
322 sha512_sw *s = snew(sha512_sw);
\r
325 BinarySink_INIT(s, sha512_sw_write);
\r
326 BinarySink_DELEGATE_INIT(&s->hash, s);
\r
330 static void sha512_sw_reset(ssh_hash *hash)
\r
332 sha512_sw *s = container_of(hash, sha512_sw, hash);
\r
334 /* The 'extra' field in the ssh_hashalg indicates which
\r
335 * initialisation vector we're using */
\r
336 memcpy(s->core, hash->vt->extra, sizeof(s->core));
\r
337 sha512_block_setup(&s->blk);
\r
340 static void sha512_sw_copyfrom(ssh_hash *hcopy, ssh_hash *horig)
\r
342 sha512_sw *copy = container_of(hcopy, sha512_sw, hash);
\r
343 sha512_sw *orig = container_of(horig, sha512_sw, hash);
\r
345 memcpy(copy, orig, sizeof(*copy));
\r
346 BinarySink_COPIED(copy);
\r
347 BinarySink_DELEGATE_INIT(©->hash, copy);
\r
350 static void sha512_sw_free(ssh_hash *hash)
\r
352 sha512_sw *s = container_of(hash, sha512_sw, hash);
\r
354 smemclr(s, sizeof(*s));
\r
358 static void sha512_sw_write(BinarySink *bs, const void *vp, size_t len)
\r
360 sha512_sw *s = BinarySink_DOWNCAST(bs, sha512_sw);
\r
363 if (sha512_block_write(&s->blk, &vp, &len))
\r
364 sha512_sw_block(s->core, s->blk.block);
\r
367 static void sha512_sw_digest(ssh_hash *hash, uint8_t *digest)
\r
369 sha512_sw *s = container_of(hash, sha512_sw, hash);
\r
371 sha512_block_pad(&s->blk, BinarySink_UPCAST(s));
\r
372 for (size_t i = 0; i < hash->vt->hlen / 8; i++)
\r
373 PUT_64BIT_MSB_FIRST(digest + 8*i, s->core[i]);
\r
376 const ssh_hashalg ssh_sha512_sw = {
\r
377 .new = sha512_sw_new,
\r
378 .reset = sha512_sw_reset,
\r
379 .copyfrom = sha512_sw_copyfrom,
\r
380 .digest = sha512_sw_digest,
\r
381 .free = sha512_sw_free,
\r
384 HASHALG_NAMES_ANNOTATED("SHA-512", "unaccelerated"),
\r
385 .extra = sha512_initial_state,
\r
388 const ssh_hashalg ssh_sha384_sw = {
\r
389 .new = sha512_sw_new,
\r
390 .reset = sha512_sw_reset,
\r
391 .copyfrom = sha512_sw_copyfrom,
\r
392 .digest = sha512_sw_digest,
\r
393 .free = sha512_sw_free,
\r
396 HASHALG_NAMES_ANNOTATED("SHA-384", "unaccelerated"),
\r
397 .extra = sha384_initial_state,
\r
400 /* ----------------------------------------------------------------------
\r
401 * Hardware-accelerated implementation of SHA-512 using Arm NEON.
\r
404 #if HW_SHA512 == HW_SHA512_NEON
\r
407 * Manually set the target architecture, if we decided above that we
\r
410 #ifdef USE_CLANG_ATTR_TARGET_AARCH64
\r
412 * A spot of cheating: redefine some ACLE feature macros before
\r
413 * including arm_neon.h. Otherwise we won't get the SHA intrinsics
\r
414 * defined by that header, because it will be looking at the settings
\r
415 * for the whole translation unit rather than the ones we're going to
\r
416 * put on some particular functions using __attribute__((target)).
\r
418 #define __ARM_NEON 1
\r
419 #define __ARM_FEATURE_CRYPTO 1
\r
420 #define FUNC_ISA __attribute__ ((target("neon,sha3")))
\r
421 #endif /* USE_CLANG_ATTR_TARGET_AARCH64 */
\r
427 #ifdef USE_ARM64_NEON_H
\r
428 #include <arm64_neon.h>
\r
430 #include <arm_neon.h>
\r
433 static bool sha512_hw_available(void)
\r
436 * For Arm, we delegate to a per-platform detection function (see
\r
437 * explanation in sshaes.c).
\r
439 return platform_sha512_hw_available();
\r
#if defined __clang__
/*
 * As of 2020-12-24, I've found that clang doesn't provide the SHA-512
 * NEON intrinsics. So I define my own set using inline assembler, and
 * use #define to effectively rename them over the top of the standard
 * names.
 *
 * The aim of that #define technique is that it should avoid a build
 * failure if these intrinsics _are_ defined in <arm_neon.h>.
 * Obviously it would be better in that situation to switch back to
 * using the real intrinsics, but until I see a version of clang that
 * supports them, I won't know what version number to test in the
 * ifdef.
 */
static inline FUNC_ISA
uint64x2_t vsha512su0q_u64_asm(uint64x2_t x, uint64x2_t y) {
    /* "+w": x is both input and output register */
    __asm__("sha512su0 %0.2D,%1.2D" : "+w" (x) : "w" (y));
    return x;
}
static inline FUNC_ISA
uint64x2_t vsha512su1q_u64_asm(uint64x2_t x, uint64x2_t y, uint64x2_t z) {
    __asm__("sha512su1 %0.2D,%1.2D,%2.2D" : "+w" (x) : "w" (y), "w" (z));
    return x;
}
static inline FUNC_ISA
uint64x2_t vsha512hq_u64_asm(uint64x2_t x, uint64x2_t y, uint64x2_t z) {
    __asm__("sha512h %0,%1,%2.2D" : "+w" (x) : "w" (y), "w" (z));
    return x;
}
static inline FUNC_ISA
uint64x2_t vsha512h2q_u64_asm(uint64x2_t x, uint64x2_t y, uint64x2_t z) {
    __asm__("sha512h2 %0,%1,%2.2D" : "+w" (x) : "w" (y), "w" (z));
    return x;
}
#undef vsha512su0q_u64
#define vsha512su0q_u64 vsha512su0q_u64_asm
#undef vsha512su1q_u64
#define vsha512su1q_u64 vsha512su1q_u64_asm
#undef vsha512hq_u64
#define vsha512hq_u64 vsha512hq_u64_asm
#undef vsha512h2q_u64
#define vsha512h2q_u64 vsha512h2q_u64_asm
#endif /* defined __clang__ */
486 typedef struct sha512_neon_core sha512_neon_core;
\r
487 struct sha512_neon_core {
\r
488 uint64x2_t ab, cd, ef, gh;
\r
492 static inline uint64x2_t sha512_neon_load_input(const uint8_t *p)
\r
494 return vreinterpretq_u64_u8(vrev64q_u8(vld1q_u8(p)));
\r
498 static inline uint64x2_t sha512_neon_schedule_update(
\r
499 uint64x2_t m8, uint64x2_t m7, uint64x2_t m4, uint64x2_t m3, uint64x2_t m1)
\r
502 * vsha512su0q_u64() takes words from a long way back in the
\r
503 * schedule and performs the sigma_0 half of the computation of
\r
504 * the next two 64-bit message-schedule words.
\r
506 * vsha512su1q_u64() combines the result of that with the sigma_1
\r
507 * steps, to output the finished version of those two words. The
\r
508 * total amount of input data it requires fits nicely into three
\r
509 * 128-bit vector registers, but one of those registers is
\r
510 * misaligned compared to the 128-bit chunks that the message
\r
511 * schedule is stored in. So we use vextq_u64 to make one of its
\r
512 * input words out of the second half of m4 and the first half of
\r
515 return vsha512su1q_u64(vsha512su0q_u64(m8, m7), m1, vextq_u64(m4, m3, 1));
\r
519 static inline void sha512_neon_round2(
\r
520 unsigned round_index, uint64x2_t schedule_words,
\r
521 uint64x2_t *ab, uint64x2_t *cd, uint64x2_t *ef, uint64x2_t *gh)
\r
524 * vsha512hq_u64 performs the Sigma_1 and Ch half of the
\r
525 * computation of two rounds of SHA-512 (including feeding back
\r
526 * one of the outputs from the first of those half-rounds into the
\r
529 * vsha512h2q_u64 combines the result of that with the Sigma_0 and
\r
530 * Maj steps, and outputs one 128-bit vector that replaces the gh
\r
531 * piece of the input hash state, and a second that updates cd by
\r
534 * Similarly to vsha512su1q_u64 above, some of the input registers
\r
535 * expected by these instructions are misaligned by 64 bits
\r
536 * relative to the chunks we've divided the hash state into, so we
\r
537 * have to start by making 'de' and 'fg' words out of our input
\r
538 * cd,ef,gh, using vextq_u64.
\r
540 * Also, one of the inputs to vsha512hq_u64 is expected to contain
\r
541 * the results of summing gh + two round constants + two words of
\r
542 * message schedule, but the two words of the message schedule
\r
543 * have to be the opposite way round in the vector register from
\r
544 * the way that vsha512su1q_u64 output them. Hence, there's
\r
545 * another vextq_u64 in here that swaps the two halves of the
\r
546 * initial_sum vector register.
\r
548 * (This also means that I don't have to prepare a specially
\r
549 * reordered version of the sha512_round_constants[] array: as
\r
550 * long as I'm unavoidably doing a swap at run time _anyway_, I
\r
551 * can load from the normally ordered version of that array, and
\r
552 * just take care to fold in that data _before_ the swap rather
\r
556 /* Load two round constants, with the first one in the low half */
\r
557 uint64x2_t round_constants = vld1q_u64(
\r
558 sha512_round_constants + round_index);
\r
560 /* Add schedule words to round constants */
\r
561 uint64x2_t initial_sum = vaddq_u64(schedule_words, round_constants);
\r
563 /* Swap that sum around so the word used in the first of the two
\r
564 * rounds is in the _high_ half of the vector, matching where h
\r
565 * lives in the gh vector */
\r
566 uint64x2_t swapped_initial_sum = vextq_u64(initial_sum, initial_sum, 1);
\r
568 /* Add gh to that, now that they're matching ways round */
\r
569 uint64x2_t sum = vaddq_u64(swapped_initial_sum, *gh);
\r
571 /* Make the misaligned de and fg words */
\r
572 uint64x2_t de = vextq_u64(*cd, *ef, 1);
\r
573 uint64x2_t fg = vextq_u64(*ef, *gh, 1);
\r
575 /* Now we're ready to put all the pieces together. The output from
\r
576 * vsha512h2q_u64 can be used directly as the new gh, and the
\r
577 * output from vsha512hq_u64 is simultaneously the intermediate
\r
578 * value passed to h2 and the thing you have to add on to cd. */
\r
579 uint64x2_t intermed = vsha512hq_u64(sum, fg, de);
\r
580 *gh = vsha512h2q_u64(intermed, *cd, *ab);
\r
581 *cd = vaddq_u64(*cd, intermed);
\r
585 static inline void sha512_neon_block(sha512_neon_core *core, const uint8_t *p)
\r
587 uint64x2_t s0, s1, s2, s3, s4, s5, s6, s7;
\r
589 uint64x2_t ab = core->ab, cd = core->cd, ef = core->ef, gh = core->gh;
\r
591 s0 = sha512_neon_load_input(p + 16*0);
\r
592 sha512_neon_round2(0, s0, &ab, &cd, &ef, &gh);
\r
593 s1 = sha512_neon_load_input(p + 16*1);
\r
594 sha512_neon_round2(2, s1, &gh, &ab, &cd, &ef);
\r
595 s2 = sha512_neon_load_input(p + 16*2);
\r
596 sha512_neon_round2(4, s2, &ef, &gh, &ab, &cd);
\r
597 s3 = sha512_neon_load_input(p + 16*3);
\r
598 sha512_neon_round2(6, s3, &cd, &ef, &gh, &ab);
\r
599 s4 = sha512_neon_load_input(p + 16*4);
\r
600 sha512_neon_round2(8, s4, &ab, &cd, &ef, &gh);
\r
601 s5 = sha512_neon_load_input(p + 16*5);
\r
602 sha512_neon_round2(10, s5, &gh, &ab, &cd, &ef);
\r
603 s6 = sha512_neon_load_input(p + 16*6);
\r
604 sha512_neon_round2(12, s6, &ef, &gh, &ab, &cd);
\r
605 s7 = sha512_neon_load_input(p + 16*7);
\r
606 sha512_neon_round2(14, s7, &cd, &ef, &gh, &ab);
\r
607 s0 = sha512_neon_schedule_update(s0, s1, s4, s5, s7);
\r
608 sha512_neon_round2(16, s0, &ab, &cd, &ef, &gh);
\r
609 s1 = sha512_neon_schedule_update(s1, s2, s5, s6, s0);
\r
610 sha512_neon_round2(18, s1, &gh, &ab, &cd, &ef);
\r
611 s2 = sha512_neon_schedule_update(s2, s3, s6, s7, s1);
\r
612 sha512_neon_round2(20, s2, &ef, &gh, &ab, &cd);
\r
613 s3 = sha512_neon_schedule_update(s3, s4, s7, s0, s2);
\r
614 sha512_neon_round2(22, s3, &cd, &ef, &gh, &ab);
\r
615 s4 = sha512_neon_schedule_update(s4, s5, s0, s1, s3);
\r
616 sha512_neon_round2(24, s4, &ab, &cd, &ef, &gh);
\r
617 s5 = sha512_neon_schedule_update(s5, s6, s1, s2, s4);
\r
618 sha512_neon_round2(26, s5, &gh, &ab, &cd, &ef);
\r
619 s6 = sha512_neon_schedule_update(s6, s7, s2, s3, s5);
\r
620 sha512_neon_round2(28, s6, &ef, &gh, &ab, &cd);
\r
621 s7 = sha512_neon_schedule_update(s7, s0, s3, s4, s6);
\r
622 sha512_neon_round2(30, s7, &cd, &ef, &gh, &ab);
\r
623 s0 = sha512_neon_schedule_update(s0, s1, s4, s5, s7);
\r
624 sha512_neon_round2(32, s0, &ab, &cd, &ef, &gh);
\r
625 s1 = sha512_neon_schedule_update(s1, s2, s5, s6, s0);
\r
626 sha512_neon_round2(34, s1, &gh, &ab, &cd, &ef);
\r
627 s2 = sha512_neon_schedule_update(s2, s3, s6, s7, s1);
\r
628 sha512_neon_round2(36, s2, &ef, &gh, &ab, &cd);
\r
629 s3 = sha512_neon_schedule_update(s3, s4, s7, s0, s2);
\r
630 sha512_neon_round2(38, s3, &cd, &ef, &gh, &ab);
\r
631 s4 = sha512_neon_schedule_update(s4, s5, s0, s1, s3);
\r
632 sha512_neon_round2(40, s4, &ab, &cd, &ef, &gh);
\r
633 s5 = sha512_neon_schedule_update(s5, s6, s1, s2, s4);
\r
634 sha512_neon_round2(42, s5, &gh, &ab, &cd, &ef);
\r
635 s6 = sha512_neon_schedule_update(s6, s7, s2, s3, s5);
\r
636 sha512_neon_round2(44, s6, &ef, &gh, &ab, &cd);
\r
637 s7 = sha512_neon_schedule_update(s7, s0, s3, s4, s6);
\r
638 sha512_neon_round2(46, s7, &cd, &ef, &gh, &ab);
\r
639 s0 = sha512_neon_schedule_update(s0, s1, s4, s5, s7);
\r
640 sha512_neon_round2(48, s0, &ab, &cd, &ef, &gh);
\r
641 s1 = sha512_neon_schedule_update(s1, s2, s5, s6, s0);
\r
642 sha512_neon_round2(50, s1, &gh, &ab, &cd, &ef);
\r
643 s2 = sha512_neon_schedule_update(s2, s3, s6, s7, s1);
\r
644 sha512_neon_round2(52, s2, &ef, &gh, &ab, &cd);
\r
645 s3 = sha512_neon_schedule_update(s3, s4, s7, s0, s2);
\r
646 sha512_neon_round2(54, s3, &cd, &ef, &gh, &ab);
\r
647 s4 = sha512_neon_schedule_update(s4, s5, s0, s1, s3);
\r
648 sha512_neon_round2(56, s4, &ab, &cd, &ef, &gh);
\r
649 s5 = sha512_neon_schedule_update(s5, s6, s1, s2, s4);
\r
650 sha512_neon_round2(58, s5, &gh, &ab, &cd, &ef);
\r
651 s6 = sha512_neon_schedule_update(s6, s7, s2, s3, s5);
\r
652 sha512_neon_round2(60, s6, &ef, &gh, &ab, &cd);
\r
653 s7 = sha512_neon_schedule_update(s7, s0, s3, s4, s6);
\r
654 sha512_neon_round2(62, s7, &cd, &ef, &gh, &ab);
\r
655 s0 = sha512_neon_schedule_update(s0, s1, s4, s5, s7);
\r
656 sha512_neon_round2(64, s0, &ab, &cd, &ef, &gh);
\r
657 s1 = sha512_neon_schedule_update(s1, s2, s5, s6, s0);
\r
658 sha512_neon_round2(66, s1, &gh, &ab, &cd, &ef);
\r
659 s2 = sha512_neon_schedule_update(s2, s3, s6, s7, s1);
\r
660 sha512_neon_round2(68, s2, &ef, &gh, &ab, &cd);
\r
661 s3 = sha512_neon_schedule_update(s3, s4, s7, s0, s2);
\r
662 sha512_neon_round2(70, s3, &cd, &ef, &gh, &ab);
\r
663 s4 = sha512_neon_schedule_update(s4, s5, s0, s1, s3);
\r
664 sha512_neon_round2(72, s4, &ab, &cd, &ef, &gh);
\r
665 s5 = sha512_neon_schedule_update(s5, s6, s1, s2, s4);
\r
666 sha512_neon_round2(74, s5, &gh, &ab, &cd, &ef);
\r
667 s6 = sha512_neon_schedule_update(s6, s7, s2, s3, s5);
\r
668 sha512_neon_round2(76, s6, &ef, &gh, &ab, &cd);
\r
669 s7 = sha512_neon_schedule_update(s7, s0, s3, s4, s6);
\r
670 sha512_neon_round2(78, s7, &cd, &ef, &gh, &ab);
\r
672 core->ab = vaddq_u64(core->ab, ab);
\r
673 core->cd = vaddq_u64(core->cd, cd);
\r
674 core->ef = vaddq_u64(core->ef, ef);
\r
675 core->gh = vaddq_u64(core->gh, gh);
\r
678 typedef struct sha512_neon {
\r
679 sha512_neon_core core;
\r
681 BinarySink_IMPLEMENTATION;
\r
685 static void sha512_neon_write(BinarySink *bs, const void *vp, size_t len);
\r
687 static ssh_hash *sha512_neon_new(const ssh_hashalg *alg)
\r
689 if (!sha512_hw_available_cached())
\r
692 sha512_neon *s = snew(sha512_neon);
\r
695 BinarySink_INIT(s, sha512_neon_write);
\r
696 BinarySink_DELEGATE_INIT(&s->hash, s);
\r
700 static void sha512_neon_reset(ssh_hash *hash)
\r
702 sha512_neon *s = container_of(hash, sha512_neon, hash);
\r
703 const uint64_t *iv = (const uint64_t *)hash->vt->extra;
\r
705 s->core.ab = vld1q_u64(iv);
\r
706 s->core.cd = vld1q_u64(iv+2);
\r
707 s->core.ef = vld1q_u64(iv+4);
\r
708 s->core.gh = vld1q_u64(iv+6);
\r
710 sha512_block_setup(&s->blk);
\r
713 static void sha512_neon_copyfrom(ssh_hash *hcopy, ssh_hash *horig)
\r
715 sha512_neon *copy = container_of(hcopy, sha512_neon, hash);
\r
716 sha512_neon *orig = container_of(horig, sha512_neon, hash);
\r
718 *copy = *orig; /* structure copy */
\r
720 BinarySink_COPIED(copy);
\r
721 BinarySink_DELEGATE_INIT(©->hash, copy);
\r
724 static void sha512_neon_free(ssh_hash *hash)
\r
726 sha512_neon *s = container_of(hash, sha512_neon, hash);
\r
727 smemclr(s, sizeof(*s));
\r
731 static void sha512_neon_write(BinarySink *bs, const void *vp, size_t len)
\r
733 sha512_neon *s = BinarySink_DOWNCAST(bs, sha512_neon);
\r
736 if (sha512_block_write(&s->blk, &vp, &len))
\r
737 sha512_neon_block(&s->core, s->blk.block);
\r
740 static void sha512_neon_digest(ssh_hash *hash, uint8_t *digest)
\r
742 sha512_neon *s = container_of(hash, sha512_neon, hash);
\r
744 sha512_block_pad(&s->blk, BinarySink_UPCAST(s));
\r
746 vst1q_u8(digest, vrev64q_u8(vreinterpretq_u8_u64(s->core.ab)));
\r
747 vst1q_u8(digest+16, vrev64q_u8(vreinterpretq_u8_u64(s->core.cd)));
\r
748 vst1q_u8(digest+32, vrev64q_u8(vreinterpretq_u8_u64(s->core.ef)));
\r
749 vst1q_u8(digest+48, vrev64q_u8(vreinterpretq_u8_u64(s->core.gh)));
\r
752 static void sha384_neon_digest(ssh_hash *hash, uint8_t *digest)
\r
754 sha512_neon *s = container_of(hash, sha512_neon, hash);
\r
756 sha512_block_pad(&s->blk, BinarySink_UPCAST(s));
\r
758 vst1q_u8(digest, vrev64q_u8(vreinterpretq_u8_u64(s->core.ab)));
\r
759 vst1q_u8(digest+16, vrev64q_u8(vreinterpretq_u8_u64(s->core.cd)));
\r
760 vst1q_u8(digest+32, vrev64q_u8(vreinterpretq_u8_u64(s->core.ef)));
\r
763 const ssh_hashalg ssh_sha512_hw = {
\r
764 .new = sha512_neon_new,
\r
765 .reset = sha512_neon_reset,
\r
766 .copyfrom = sha512_neon_copyfrom,
\r
767 .digest = sha512_neon_digest,
\r
768 .free = sha512_neon_free,
\r
771 HASHALG_NAMES_ANNOTATED("SHA-512", "NEON accelerated"),
\r
772 .extra = sha512_initial_state,
\r
775 const ssh_hashalg ssh_sha384_hw = {
\r
776 .new = sha512_neon_new,
\r
777 .reset = sha512_neon_reset,
\r
778 .copyfrom = sha512_neon_copyfrom,
\r
779 .digest = sha384_neon_digest,
\r
780 .free = sha512_neon_free,
\r
783 HASHALG_NAMES_ANNOTATED("SHA-384", "NEON accelerated"),
\r
784 .extra = sha384_initial_state,
\r
787 /* ----------------------------------------------------------------------
\r
788 * Stub functions if we have no hardware-accelerated SHA-512. In this
\r
789 * case, sha512_hw_new returns NULL (though it should also never be
\r
790 * selected by sha512_select, so the only thing that should even be
\r
791 * _able_ to call it is testcrypt). As a result, the remaining vtable
\r
792 * functions should never be called at all.
\r
795 #elif HW_SHA512 == HW_SHA512_NONE
\r
797 static bool sha512_hw_available(void)
\r
802 static ssh_hash *sha512_stub_new(const ssh_hashalg *alg)
\r
807 #define STUB_BODY { unreachable("Should never be called"); }
\r
809 static void sha512_stub_reset(ssh_hash *hash) STUB_BODY
\r
810 static void sha512_stub_copyfrom(ssh_hash *hash, ssh_hash *orig) STUB_BODY
\r
811 static void sha512_stub_free(ssh_hash *hash) STUB_BODY
\r
812 static void sha512_stub_digest(ssh_hash *hash, uint8_t *digest) STUB_BODY
\r
814 const ssh_hashalg ssh_sha512_hw = {
\r
815 .new = sha512_stub_new,
\r
816 .reset = sha512_stub_reset,
\r
817 .copyfrom = sha512_stub_copyfrom,
\r
818 .digest = sha512_stub_digest,
\r
819 .free = sha512_stub_free,
\r
822 HASHALG_NAMES_ANNOTATED("SHA-512", "!NONEXISTENT ACCELERATED VERSION!"),
\r
825 const ssh_hashalg ssh_sha384_hw = {
\r
826 .new = sha512_stub_new,
\r
827 .reset = sha512_stub_reset,
\r
828 .copyfrom = sha512_stub_copyfrom,
\r
829 .digest = sha512_stub_digest,
\r
830 .free = sha512_stub_free,
\r
833 HASHALG_NAMES_ANNOTATED("SHA-384", "!NONEXISTENT ACCELERATED VERSION!"),
\r
836 #endif /* HW_SHA512 */
\r