/* Web page header text carried over from the repository browser:
   c++: -frounding-math test [PR109359]
   [official-gcc.git] / libiberty / sha1.c
   blob 49e8e0b6c2b4958674b66255d7f81a70763eefb8  */
/* sha1.c - Functions to compute SHA1 message digest of files or
   memory blocks according to the NIST specification FIPS-180-1.

   Copyright (C) 2000-2024 Free Software Foundation, Inc.

   This program is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by the
   Free Software Foundation; either version 2, or (at your option) any
   later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software Foundation,
   Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.  */
/* Written by Scott G. Miller
   Credits:
      Robert Klep <robert@ilse.nl>  -- Expansion function fix
*/
#include <config.h>

#include "sha1.h"

#include <stddef.h>
#include <string.h>

#ifdef HAVE_X86_SHA1_HW_SUPPORT
# include <x86intrin.h>
# include <cpuid.h>
#endif

#if USE_UNLOCKED_IO
# include "unlocked-io.h"
#endif
/* SWAP (n) converts a 32-bit word between host byte order and the
   most-significant-byte-first order used for message words and for the
   digest.  When WORDS_BIGENDIAN is defined the host already uses that
   order and SWAP is the identity; otherwise the four bytes are reversed.
   N is evaluated several times, so it must have no side effects, and it
   is expected to be an unsigned 32-bit value.  */
#ifdef WORDS_BIGENDIAN
# define SWAP(n) (n)
#else
# define SWAP(n) \
    (((n) << 24) | (((n) & 0xff00) << 8) | (((n) >> 8) & 0xff00) | ((n) >> 24))
#endif

/* Number of bytes sha1_stream reads per iteration.  It must be a
   multiple of 64 because whole chunks are handed to sha1_process_block.  */
#define BLOCKSIZE 4096
#if BLOCKSIZE % 64 != 0
# error "invalid BLOCKSIZE"
#endif
/* This array contains the bytes used to pad the buffer to the next
   64-byte boundary: a single 0x80 byte followed by zeros.  The padding
   rule is the one SHA-1 shares with MD5 (RFC 1321, 3.1: Step 1).  */
static const unsigned char fillbuf[64] = { 0x80, 0 /* , 0, 0, ...  */ };
58 /* Take a pointer to a 160 bit block of data (five 32 bit ints) and
59 initialize it to the start constants of the SHA1 algorithm. This
60 must be called before using hash in the call to sha1_hash. */
61 void
62 sha1_init_ctx (struct sha1_ctx *ctx)
64 ctx->A = 0x67452301;
65 ctx->B = 0xefcdab89;
66 ctx->C = 0x98badcfe;
67 ctx->D = 0x10325476;
68 ctx->E = 0xc3d2e1f0;
70 ctx->total[0] = ctx->total[1] = 0;
71 ctx->buflen = 0;
74 /* Put result from CTX in first 20 bytes following RESBUF. The result
75 must be in little endian byte order.
77 IMPORTANT: On some systems it is required that RESBUF is correctly
78 aligned for a 32-bit value. */
79 void *
80 sha1_read_ctx (const struct sha1_ctx *ctx, void *resbuf)
82 ((sha1_uint32 *) resbuf)[0] = SWAP (ctx->A);
83 ((sha1_uint32 *) resbuf)[1] = SWAP (ctx->B);
84 ((sha1_uint32 *) resbuf)[2] = SWAP (ctx->C);
85 ((sha1_uint32 *) resbuf)[3] = SWAP (ctx->D);
86 ((sha1_uint32 *) resbuf)[4] = SWAP (ctx->E);
88 return resbuf;
91 /* Process the remaining bytes in the internal buffer and the usual
92 prolog according to the standard and write the result to RESBUF.
94 IMPORTANT: On some systems it is required that RESBUF is correctly
95 aligned for a 32-bit value. */
96 void *
97 sha1_finish_ctx (struct sha1_ctx *ctx, void *resbuf)
99 /* Take yet unprocessed bytes into account. */
100 sha1_uint32 bytes = ctx->buflen;
101 size_t size = (bytes < 56) ? 64 / 4 : 64 * 2 / 4;
103 /* Now count remaining bytes. */
104 ctx->total[0] += bytes;
105 if (ctx->total[0] < bytes)
106 ++ctx->total[1];
108 /* Put the 64-bit file length in *bits* at the end of the buffer. */
109 ctx->buffer[size - 2] = SWAP ((ctx->total[1] << 3) | (ctx->total[0] >> 29));
110 ctx->buffer[size - 1] = SWAP (ctx->total[0] << 3);
112 memcpy (&((char *) ctx->buffer)[bytes], fillbuf, (size - 2) * 4 - bytes);
114 /* Process last bytes. */
115 sha1_process_block (ctx->buffer, size * 4, ctx);
117 return sha1_read_ctx (ctx, resbuf);
120 /* Compute SHA1 message digest for bytes read from STREAM. The
121 resulting message digest number will be written into the 16 bytes
122 beginning at RESBLOCK. */
124 sha1_stream (FILE *stream, void *resblock)
126 struct sha1_ctx ctx;
127 char buffer[BLOCKSIZE + 72];
128 size_t sum;
130 /* Initialize the computation context. */
131 sha1_init_ctx (&ctx);
133 /* Iterate over full file contents. */
134 while (1)
136 /* We read the file in blocks of BLOCKSIZE bytes. One call of the
137 computation function processes the whole buffer so that with the
138 next round of the loop another block can be read. */
139 size_t n;
140 sum = 0;
142 /* Read block. Take care for partial reads. */
143 while (1)
145 n = fread (buffer + sum, 1, BLOCKSIZE - sum, stream);
147 sum += n;
149 if (sum == BLOCKSIZE)
150 break;
152 if (n == 0)
154 /* Check for the error flag IFF N == 0, so that we don't
155 exit the loop after a partial read due to e.g., EAGAIN
156 or EWOULDBLOCK. */
157 if (ferror (stream))
158 return 1;
159 goto process_partial_block;
162 /* We've read at least one byte, so ignore errors. But always
163 check for EOF, since feof may be true even though N > 0.
164 Otherwise, we could end up calling fread after EOF. */
165 if (feof (stream))
166 goto process_partial_block;
169 /* Process buffer with BLOCKSIZE bytes. Note that
170 BLOCKSIZE % 64 == 0
172 sha1_process_block (buffer, BLOCKSIZE, &ctx);
175 process_partial_block:;
177 /* Process any remaining bytes. */
178 if (sum > 0)
179 sha1_process_bytes (buffer, sum, &ctx);
181 /* Construct result in desired memory. */
182 sha1_finish_ctx (&ctx, resblock);
183 return 0;
186 /* Compute SHA1 message digest for LEN bytes beginning at BUFFER. The
187 result is always in little endian byte order, so that a byte-wise
188 output yields to the wanted ASCII representation of the message
189 digest. */
190 void *
191 sha1_buffer (const char *buffer, size_t len, void *resblock)
193 struct sha1_ctx ctx;
195 /* Initialize the computation context. */
196 sha1_init_ctx (&ctx);
198 /* Process whole buffer but last len % 64 bytes. */
199 sha1_process_bytes (buffer, len, &ctx);
201 /* Put result in desired memory area. */
202 return sha1_finish_ctx (&ctx, resblock);
205 void
206 sha1_process_bytes (const void *buffer, size_t len, struct sha1_ctx *ctx)
208 /* When we already have some bits in our internal buffer concatenate
209 both inputs first. */
210 if (ctx->buflen != 0)
212 size_t left_over = ctx->buflen;
213 size_t add = 128 - left_over > len ? len : 128 - left_over;
215 memcpy (&((char *) ctx->buffer)[left_over], buffer, add);
216 ctx->buflen += add;
218 if (ctx->buflen > 64)
220 sha1_process_block (ctx->buffer, ctx->buflen & ~63, ctx);
222 ctx->buflen &= 63;
223 /* The regions in the following copy operation cannot overlap. */
224 memcpy (ctx->buffer,
225 &((char *) ctx->buffer)[(left_over + add) & ~63],
226 ctx->buflen);
229 buffer = (const char *) buffer + add;
230 len -= add;
233 /* Process available complete blocks. */
234 if (len >= 64)
236 #if !_STRING_ARCH_unaligned
237 # define alignof(type) offsetof (struct { char c; type x; }, x)
238 # define UNALIGNED_P(p) (((size_t) p) % alignof (sha1_uint32) != 0)
239 if (UNALIGNED_P (buffer))
240 while (len > 64)
242 sha1_process_block (memcpy (ctx->buffer, buffer, 64), 64, ctx);
243 buffer = (const char *) buffer + 64;
244 len -= 64;
246 else
247 #endif
249 sha1_process_block (buffer, len & ~63, ctx);
250 buffer = (const char *) buffer + (len & ~63);
251 len &= 63;
255 /* Move remaining bytes in internal buffer. */
256 if (len > 0)
258 size_t left_over = ctx->buflen;
260 memcpy (&((char *) ctx->buffer)[left_over], buffer, len);
261 left_over += len;
262 if (left_over >= 64)
264 sha1_process_block (ctx->buffer, 64, ctx);
265 left_over -= 64;
266 memmove (ctx->buffer, &ctx->buffer[16], left_over);
268 ctx->buflen = left_over;
/* --- Code below is the primary difference between md5.c and sha1.c --- */

/* SHA1 round constants, one per group of 20 rounds.  */
#define K1 0x5a827999
#define K2 0x6ed9eba1
#define K3 0x8f1bbcdc
#define K4 0xca62c1d6

/* Round functions.  Note that F2 is the same as F4.
   F1 selects bits of C where B is set and bits of D elsewhere, F2/F4 is
   the bitwise parity of the three inputs, F3 is their bitwise majority.
   Arguments are parenthesized so arbitrary expressions may be passed.  */
#define F1(B,C,D) ( (D) ^ ( (B) & ( (C) ^ (D) ) ) )
#define F2(B,C,D) ((B) ^ (C) ^ (D))
#define F3(B,C,D) ( ( (B) & (C) ) | ( (D) & ( (B) | (C) ) ) )
#define F4(B,C,D) ((B) ^ (C) ^ (D))
286 /* Process LEN bytes of BUFFER, accumulating context into CTX.
287 It is assumed that LEN % 64 == 0.
288 Most of this code comes from GnuPG's cipher/sha1.c. */
290 void
291 sha1_process_block (const void *buffer, size_t len, struct sha1_ctx *ctx)
293 const sha1_uint32 *words = (const sha1_uint32*) buffer;
294 size_t nwords = len / sizeof (sha1_uint32);
295 const sha1_uint32 *endp = words + nwords;
296 sha1_uint32 x[16];
297 sha1_uint32 a = ctx->A;
298 sha1_uint32 b = ctx->B;
299 sha1_uint32 c = ctx->C;
300 sha1_uint32 d = ctx->D;
301 sha1_uint32 e = ctx->E;
303 /* First increment the byte count. RFC 1321 specifies the possible
304 length of the file up to 2^64 bits. Here we only compute the
305 number of bytes. Do a double word increment. */
306 ctx->total[0] += len;
307 ctx->total[1] += ((len >> 31) >> 1) + (ctx->total[0] < len);
309 #define rol(x, n) (((x) << (n)) | ((sha1_uint32) (x) >> (32 - (n))))
311 #define M(I) ( tm = x[I&0x0f] ^ x[(I-14)&0x0f] \
312 ^ x[(I-8)&0x0f] ^ x[(I-3)&0x0f] \
313 , (x[I&0x0f] = rol(tm, 1)) )
315 #define R(A,B,C,D,E,F,K,M) do { E += rol( A, 5 ) \
316 + F( B, C, D ) \
317 + K \
318 + M; \
319 B = rol( B, 30 ); \
320 } while(0)
322 while (words < endp)
324 sha1_uint32 tm;
325 int t;
326 for (t = 0; t < 16; t++)
328 x[t] = SWAP (*words);
329 words++;
332 R( a, b, c, d, e, F1, K1, x[ 0] );
333 R( e, a, b, c, d, F1, K1, x[ 1] );
334 R( d, e, a, b, c, F1, K1, x[ 2] );
335 R( c, d, e, a, b, F1, K1, x[ 3] );
336 R( b, c, d, e, a, F1, K1, x[ 4] );
337 R( a, b, c, d, e, F1, K1, x[ 5] );
338 R( e, a, b, c, d, F1, K1, x[ 6] );
339 R( d, e, a, b, c, F1, K1, x[ 7] );
340 R( c, d, e, a, b, F1, K1, x[ 8] );
341 R( b, c, d, e, a, F1, K1, x[ 9] );
342 R( a, b, c, d, e, F1, K1, x[10] );
343 R( e, a, b, c, d, F1, K1, x[11] );
344 R( d, e, a, b, c, F1, K1, x[12] );
345 R( c, d, e, a, b, F1, K1, x[13] );
346 R( b, c, d, e, a, F1, K1, x[14] );
347 R( a, b, c, d, e, F1, K1, x[15] );
348 R( e, a, b, c, d, F1, K1, M(16) );
349 R( d, e, a, b, c, F1, K1, M(17) );
350 R( c, d, e, a, b, F1, K1, M(18) );
351 R( b, c, d, e, a, F1, K1, M(19) );
352 R( a, b, c, d, e, F2, K2, M(20) );
353 R( e, a, b, c, d, F2, K2, M(21) );
354 R( d, e, a, b, c, F2, K2, M(22) );
355 R( c, d, e, a, b, F2, K2, M(23) );
356 R( b, c, d, e, a, F2, K2, M(24) );
357 R( a, b, c, d, e, F2, K2, M(25) );
358 R( e, a, b, c, d, F2, K2, M(26) );
359 R( d, e, a, b, c, F2, K2, M(27) );
360 R( c, d, e, a, b, F2, K2, M(28) );
361 R( b, c, d, e, a, F2, K2, M(29) );
362 R( a, b, c, d, e, F2, K2, M(30) );
363 R( e, a, b, c, d, F2, K2, M(31) );
364 R( d, e, a, b, c, F2, K2, M(32) );
365 R( c, d, e, a, b, F2, K2, M(33) );
366 R( b, c, d, e, a, F2, K2, M(34) );
367 R( a, b, c, d, e, F2, K2, M(35) );
368 R( e, a, b, c, d, F2, K2, M(36) );
369 R( d, e, a, b, c, F2, K2, M(37) );
370 R( c, d, e, a, b, F2, K2, M(38) );
371 R( b, c, d, e, a, F2, K2, M(39) );
372 R( a, b, c, d, e, F3, K3, M(40) );
373 R( e, a, b, c, d, F3, K3, M(41) );
374 R( d, e, a, b, c, F3, K3, M(42) );
375 R( c, d, e, a, b, F3, K3, M(43) );
376 R( b, c, d, e, a, F3, K3, M(44) );
377 R( a, b, c, d, e, F3, K3, M(45) );
378 R( e, a, b, c, d, F3, K3, M(46) );
379 R( d, e, a, b, c, F3, K3, M(47) );
380 R( c, d, e, a, b, F3, K3, M(48) );
381 R( b, c, d, e, a, F3, K3, M(49) );
382 R( a, b, c, d, e, F3, K3, M(50) );
383 R( e, a, b, c, d, F3, K3, M(51) );
384 R( d, e, a, b, c, F3, K3, M(52) );
385 R( c, d, e, a, b, F3, K3, M(53) );
386 R( b, c, d, e, a, F3, K3, M(54) );
387 R( a, b, c, d, e, F3, K3, M(55) );
388 R( e, a, b, c, d, F3, K3, M(56) );
389 R( d, e, a, b, c, F3, K3, M(57) );
390 R( c, d, e, a, b, F3, K3, M(58) );
391 R( b, c, d, e, a, F3, K3, M(59) );
392 R( a, b, c, d, e, F4, K4, M(60) );
393 R( e, a, b, c, d, F4, K4, M(61) );
394 R( d, e, a, b, c, F4, K4, M(62) );
395 R( c, d, e, a, b, F4, K4, M(63) );
396 R( b, c, d, e, a, F4, K4, M(64) );
397 R( a, b, c, d, e, F4, K4, M(65) );
398 R( e, a, b, c, d, F4, K4, M(66) );
399 R( d, e, a, b, c, F4, K4, M(67) );
400 R( c, d, e, a, b, F4, K4, M(68) );
401 R( b, c, d, e, a, F4, K4, M(69) );
402 R( a, b, c, d, e, F4, K4, M(70) );
403 R( e, a, b, c, d, F4, K4, M(71) );
404 R( d, e, a, b, c, F4, K4, M(72) );
405 R( c, d, e, a, b, F4, K4, M(73) );
406 R( b, c, d, e, a, F4, K4, M(74) );
407 R( a, b, c, d, e, F4, K4, M(75) );
408 R( e, a, b, c, d, F4, K4, M(76) );
409 R( d, e, a, b, c, F4, K4, M(77) );
410 R( c, d, e, a, b, F4, K4, M(78) );
411 R( b, c, d, e, a, F4, K4, M(79) );
413 a = ctx->A += a;
414 b = ctx->B += b;
415 c = ctx->C += c;
416 d = ctx->D += d;
417 e = ctx->E += e;
#if defined(HAVE_X86_SHA1_HW_SUPPORT)
/* HW specific version of sha1_process_bytes.  */

static void sha1_hw_process_block (const void *, size_t, struct sha1_ctx *);

/* Same buffering logic as sha1_process_bytes, except that complete
   64-byte blocks are handed to sha1_hw_process_block.  LEN may be
   arbitrary; bytes that do not fill a block stay in CTX->buffer with
   their count in CTX->buflen.  */
static void
sha1_hw_process_bytes (const void *buffer, size_t len, struct sha1_ctx *ctx)
{
  /* When we already have some bits in our internal buffer concatenate
     both inputs first.  */
  if (ctx->buflen != 0)
    {
      size_t left_over = ctx->buflen;
      size_t add = 128 - left_over > len ? len : 128 - left_over;

      memcpy (&((char *) ctx->buffer)[left_over], buffer, add);
      ctx->buflen += add;

      if (ctx->buflen > 64)
        {
          sha1_hw_process_block (ctx->buffer, ctx->buflen & ~63, ctx);

          ctx->buflen &= 63;
          /* The regions in the following copy operation cannot overlap.  */
          memcpy (ctx->buffer,
                  &((char *) ctx->buffer)[(left_over + add) & ~63],
                  ctx->buflen);
        }

      buffer = (const char *) buffer + add;
      len -= add;
    }

  /* Process available complete blocks.  */
  if (len >= 64)
    {
#if !_STRING_ARCH_unaligned
# define alignof(type) offsetof (struct { char c; type x; }, x)
# define UNALIGNED_P(p) (((size_t) p) % alignof (sha1_uint32) != 0)
      /* A misaligned source is copied through CTX->buffer one block at a
         time; up to 64 bytes may be left for the code below.  */
      if (UNALIGNED_P (buffer))
        while (len > 64)
          {
            sha1_hw_process_block (memcpy (ctx->buffer, buffer, 64), 64, ctx);
            buffer = (const char *) buffer + 64;
            len -= 64;
          }
      else
#endif
        {
          sha1_hw_process_block (buffer, len & ~63, ctx);
          buffer = (const char *) buffer + (len & ~63);
          len &= 63;
        }
    }

  /* Move remaining bytes in internal buffer.  */
  if (len > 0)
    {
      size_t left_over = ctx->buflen;

      memcpy (&((char *) ctx->buffer)[left_over], buffer, len);
      left_over += len;
      if (left_over >= 64)
        {
          sha1_hw_process_block (ctx->buffer, 64, ctx);
          left_over -= 64;
          memmove (ctx->buffer, &ctx->buffer[16], left_over);
        }
      ctx->buflen = left_over;
    }
}

/* Process LEN bytes of BUFFER, accumulating context into CTX.
   Using CPU specific intrinsics.  The loop consumes 64 bytes per
   iteration, so LEN is expected to be a multiple of 64.  LEN is also
   added to the running byte count kept in CTX->total.  */

#ifdef HAVE_X86_SHA1_HW_SUPPORT
__attribute__((__target__ ("sse4.1,sha")))
#endif
static void
sha1_hw_process_block (const void *buffer, size_t len, struct sha1_ctx *ctx)
{
#ifdef HAVE_X86_SHA1_HW_SUPPORT
  /* Implemented from
     https://www.intel.com/content/www/us/en/developer/articles/technical/intel-sha-extensions.html  */
  const __m128i *words = (const __m128i *) buffer;
  const __m128i *endp = (const __m128i *) ((const char *) buffer + len);
  __m128i abcd, abcd_save, e0, e0_save, e1, msg0, msg1, msg2, msg3;
  /* Byte shuffle that reverses all 16 bytes of a vector.  */
  const __m128i shuf_mask
    = _mm_set_epi64x (0x0001020304050607ULL, 0x08090a0b0c0d0e0fULL);
  /* Compile-time check (negative array size on failure) that A, B, C
     and D are laid out contiguously, so that the single 128-bit load
     and store at &ctx->A below cover exactly those four words.  */
  char check[((offsetof (struct sha1_ctx, B)
               == offsetof (struct sha1_ctx, A) + sizeof (ctx->A))
              && (offsetof (struct sha1_ctx, C)
                  == offsetof (struct sha1_ctx, A) + 2 * sizeof (ctx->A))
              && (offsetof (struct sha1_ctx, D)
                  == offsetof (struct sha1_ctx, A) + 3 * sizeof (ctx->A)))
             ? 1 : -1];

  /* First increment the byte count.  RFC 1321 specifies the possible
     length of the file up to 2^64 bits.  Here we only compute the
     number of bytes.  Do a double word increment.  */
  ctx->total[0] += len;
  ctx->total[1] += ((len >> 31) >> 1) + (ctx->total[0] < len);

  /* Reference CHECK so it is not reported as unused.  */
  (void) &check[0];
  abcd = _mm_loadu_si128 ((const __m128i *) &ctx->A);
  e0 = _mm_set_epi32 (ctx->E, 0, 0, 0);
  abcd = _mm_shuffle_epi32 (abcd, 0x1b); /* 0, 1, 2, 3 */

  while (words < endp)
    {
      abcd_save = abcd;
      e0_save = e0;

      /* 0..3 */
      msg0 = _mm_loadu_si128 (words);
      msg0 = _mm_shuffle_epi8 (msg0, shuf_mask);
      e0 = _mm_add_epi32 (e0, msg0);
      e1 = abcd;
      abcd = _mm_sha1rnds4_epu32 (abcd, e0, 0);

      /* 4..7 */
      msg1 = _mm_loadu_si128 (words + 1);
      msg1 = _mm_shuffle_epi8 (msg1, shuf_mask);
      e1 = _mm_sha1nexte_epu32 (e1, msg1);
      e0 = abcd;
      abcd = _mm_sha1rnds4_epu32 (abcd, e1, 0);
      msg0 = _mm_sha1msg1_epu32 (msg0, msg1);

      /* 8..11 */
      msg2 = _mm_loadu_si128 (words + 2);
      msg2 = _mm_shuffle_epi8 (msg2, shuf_mask);
      e0 = _mm_sha1nexte_epu32 (e0, msg2);
      e1 = abcd;
      abcd = _mm_sha1rnds4_epu32 (abcd, e0, 0);
      msg1 = _mm_sha1msg1_epu32 (msg1, msg2);
      msg0 = _mm_xor_si128 (msg0, msg2);

      /* 12..15 */
      msg3 = _mm_loadu_si128 (words + 3);
      msg3 = _mm_shuffle_epi8 (msg3, shuf_mask);
      e1 = _mm_sha1nexte_epu32 (e1, msg3);
      e0 = abcd;
      msg0 = _mm_sha1msg2_epu32 (msg0, msg3);
      abcd = _mm_sha1rnds4_epu32 (abcd, e1, 0);
      msg2 = _mm_sha1msg1_epu32 (msg2, msg3);
      msg1 = _mm_xor_si128 (msg1, msg3);

      /* 16..19 */
      e0 = _mm_sha1nexte_epu32 (e0, msg0);
      e1 = abcd;
      msg1 = _mm_sha1msg2_epu32 (msg1, msg0);
      abcd = _mm_sha1rnds4_epu32 (abcd, e0, 0);
      msg3 = _mm_sha1msg1_epu32 (msg3, msg0);
      msg2 = _mm_xor_si128 (msg2, msg0);

      /* 20..23 */
      e1 = _mm_sha1nexte_epu32 (e1, msg1);
      e0 = abcd;
      msg2 = _mm_sha1msg2_epu32 (msg2, msg1);
      abcd = _mm_sha1rnds4_epu32 (abcd, e1, 1);
      msg0 = _mm_sha1msg1_epu32 (msg0, msg1);
      msg3 = _mm_xor_si128 (msg3, msg1);

      /* 24..27 */
      e0 = _mm_sha1nexte_epu32 (e0, msg2);
      e1 = abcd;
      msg3 = _mm_sha1msg2_epu32 (msg3, msg2);
      abcd = _mm_sha1rnds4_epu32 (abcd, e0, 1);
      msg1 = _mm_sha1msg1_epu32 (msg1, msg2);
      msg0 = _mm_xor_si128 (msg0, msg2);

      /* 28..31 */
      e1 = _mm_sha1nexte_epu32 (e1, msg3);
      e0 = abcd;
      msg0 = _mm_sha1msg2_epu32 (msg0, msg3);
      abcd = _mm_sha1rnds4_epu32 (abcd, e1, 1);
      msg2 = _mm_sha1msg1_epu32 (msg2, msg3);
      msg1 = _mm_xor_si128 (msg1, msg3);

      /* 32..35 */
      e0 = _mm_sha1nexte_epu32 (e0, msg0);
      e1 = abcd;
      msg1 = _mm_sha1msg2_epu32 (msg1, msg0);
      abcd = _mm_sha1rnds4_epu32 (abcd, e0, 1);
      msg3 = _mm_sha1msg1_epu32 (msg3, msg0);
      msg2 = _mm_xor_si128 (msg2, msg0);

      /* 36..39 */
      e1 = _mm_sha1nexte_epu32 (e1, msg1);
      e0 = abcd;
      msg2 = _mm_sha1msg2_epu32 (msg2, msg1);
      abcd = _mm_sha1rnds4_epu32 (abcd, e1, 1);
      msg0 = _mm_sha1msg1_epu32 (msg0, msg1);
      msg3 = _mm_xor_si128 (msg3, msg1);

      /* 40..43 */
      e0 = _mm_sha1nexte_epu32 (e0, msg2);
      e1 = abcd;
      msg3 = _mm_sha1msg2_epu32 (msg3, msg2);
      abcd = _mm_sha1rnds4_epu32 (abcd, e0, 2);
      msg1 = _mm_sha1msg1_epu32 (msg1, msg2);
      msg0 = _mm_xor_si128 (msg0, msg2);

      /* 44..47 */
      e1 = _mm_sha1nexte_epu32 (e1, msg3);
      e0 = abcd;
      msg0 = _mm_sha1msg2_epu32 (msg0, msg3);
      abcd = _mm_sha1rnds4_epu32 (abcd, e1, 2);
      msg2 = _mm_sha1msg1_epu32 (msg2, msg3);
      msg1 = _mm_xor_si128 (msg1, msg3);

      /* 48..51 */
      e0 = _mm_sha1nexte_epu32 (e0, msg0);
      e1 = abcd;
      msg1 = _mm_sha1msg2_epu32 (msg1, msg0);
      abcd = _mm_sha1rnds4_epu32 (abcd, e0, 2);
      msg3 = _mm_sha1msg1_epu32 (msg3, msg0);
      msg2 = _mm_xor_si128 (msg2, msg0);

      /* 52..55 */
      e1 = _mm_sha1nexte_epu32 (e1, msg1);
      e0 = abcd;
      msg2 = _mm_sha1msg2_epu32 (msg2, msg1);
      abcd = _mm_sha1rnds4_epu32 (abcd, e1, 2);
      msg0 = _mm_sha1msg1_epu32 (msg0, msg1);
      msg3 = _mm_xor_si128 (msg3, msg1);

      /* 56..59 */
      e0 = _mm_sha1nexte_epu32 (e0, msg2);
      e1 = abcd;
      msg3 = _mm_sha1msg2_epu32 (msg3, msg2);
      abcd = _mm_sha1rnds4_epu32 (abcd, e0, 2);
      msg1 = _mm_sha1msg1_epu32 (msg1, msg2);
      msg0 = _mm_xor_si128 (msg0, msg2);

      /* 60..63 */
      e1 = _mm_sha1nexte_epu32 (e1, msg3);
      e0 = abcd;
      msg0 = _mm_sha1msg2_epu32 (msg0, msg3);
      abcd = _mm_sha1rnds4_epu32 (abcd, e1, 3);
      msg2 = _mm_sha1msg1_epu32 (msg2, msg3);
      msg1 = _mm_xor_si128 (msg1, msg3);

      /* 64..67 */
      e0 = _mm_sha1nexte_epu32 (e0, msg0);
      e1 = abcd;
      msg1 = _mm_sha1msg2_epu32 (msg1, msg0);
      abcd = _mm_sha1rnds4_epu32 (abcd, e0, 3);
      msg3 = _mm_sha1msg1_epu32 (msg3, msg0);
      msg2 = _mm_xor_si128 (msg2, msg0);

      /* 68..71 */
      e1 = _mm_sha1nexte_epu32 (e1, msg1);
      e0 = abcd;
      msg2 = _mm_sha1msg2_epu32 (msg2, msg1);
      abcd = _mm_sha1rnds4_epu32 (abcd, e1, 3);
      msg3 = _mm_xor_si128 (msg3, msg1);

      /* 72..75 */
      e0 = _mm_sha1nexte_epu32 (e0, msg2);
      e1 = abcd;
      msg3 = _mm_sha1msg2_epu32 (msg3, msg2);
      abcd = _mm_sha1rnds4_epu32 (abcd, e0, 3);

      /* 76..79 */
      e1 = _mm_sha1nexte_epu32 (e1, msg3);
      e0 = abcd;
      abcd = _mm_sha1rnds4_epu32 (abcd, e1, 3);

      /* Finalize. */
      e0 = _mm_sha1nexte_epu32 (e0, e0_save);
      abcd = _mm_add_epi32 (abcd, abcd_save);

      /* Advance by four 16-byte vectors, i.e. one 64-byte block.  */
      words = words + 4;
    }

  abcd = _mm_shuffle_epi32 (abcd, 0x1b); /* 0, 1, 2, 3 */
  _mm_storeu_si128 ((__m128i *) &ctx->A, abcd);
  ctx->E = _mm_extract_epi32 (e0, 3);
#endif
}
#endif
704 /* Return sha1_process_bytes or some hardware optimized version thereof
705 depending on current CPU. */
707 sha1_process_bytes_fn
708 sha1_choose_process_bytes (void)
710 #ifdef HAVE_X86_SHA1_HW_SUPPORT
711 unsigned int eax, ebx, ecx, edx;
712 if (__get_cpuid_count (7, 0, &eax, &ebx, &ecx, &edx)
713 && (ebx & bit_SHA) != 0
714 && __get_cpuid (1, &eax, &ebx, &ecx, &edx)
715 && (ecx & bit_SSE4_1) != 0)
716 return sha1_hw_process_bytes;
717 #endif
718 return sha1_process_bytes;