3 # ====================================================================
4 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
10 # sha1_block procedure for x86_64.
12 # It was brought to my attention that on EM64T compiler-generated code
13 # was far behind 32-bit assembler implementation. This is unlike on
14 # Opteron where compiler-generated code was only 15% behind 32-bit
15 # assembler, which originally made it hard to motivate the effort.
16 # There was a suggestion to mechanically translate 32-bit code, but I
17 # dismissed it, reasoning that x86_64 offers enough register bank
18 # capacity to fully utilize SHA-1 parallelism. Therefore this fresh
19 # implementation:-) However! While 64-bit code does perform better
20 # on Opteron, I failed to beat 32-bit assembler on EM64T core. Well,
21 # x86_64 does offer larger *addressable* bank, but out-of-order core
22 # reaches for even more registers through dynamic aliasing, and EM64T
23 # core must have managed to run-time optimize even 32-bit code just as
24 # well as the 64-bit one. Performance improvement is summarized in the table below:
27 # gcc 3.4 32-bit asm cycles/byte
28 # Opteron +45% +20% 6.8
29 # Xeon P4 +65% +0% 9.9
34 # The code was revised to minimize code size and to maximize
35 # "distance" between instructions producing input to 'lea'
36 # instruction and the 'lea' instruction itself, which is essential
37 # for Intel Atom core.
41 # Add SSSE3, Supplemental[!] SSE3, implementation. The idea behind it
42 # is to offload message schedule denoted by Wt in NIST specification,
43 # or Xupdate in OpenSSL source, to SIMD unit. See sha1-586.pl module
44 # for background and implementation details. The only difference from
45 # 32-bit code is that 64-bit code doesn't have to spill @X[] elements
46 # to free temporary registers.
50 # Add AVX code path. See sha1-586.pl for further information.
52 ######################################################################
53 # Current performance is summarized in the following table. Numbers are
54 # CPU clock cycles spent to process single byte (less is better).
59 # Core2 6.7 6.1/+10% -
60 # Atom 11.0 9.7/+13% -
61 # Westmere 7.1 5.6/+27% -
62 # Sandy Bridge 7.9 6.3/+25% 5.2/+51%
66 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
68 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
70 $0 =~ m/(.*[\/\\])[^\
/\\]+$/; $dir=$1;
71 ( $xlate="${dir}x86_64-xlate.pl" and -f
$xlate ) or
72 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f
$xlate) or
73 die "can't locate x86_64-xlate.pl";
75 $avx=1 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
76 =~ /GNU assembler version ([2-9]\.[0-9]+)/ &&
78 $avx=1 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM
} =~ /nasm/) &&
79 `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/ &&
81 $avx=1 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM
} =~ /ml64/) &&
82 `ml64 2>&1` =~ /Version ([0-9]+)\./ &&
85 open OUT,"| \"$^X\" $xlate $flavour $output" or die "can't call $xlate: $!"; # pipe generated code through the x86_64-xlate.pl translator; abort if the pipe cannot be started
88 $ctx="%rdi"; # 1st arg
89 $inp="%rsi"; # 2nd arg
90 $num="%rdx"; # 3rd arg
92 # reassign arguments in order to produce more compact code
110 my ($i,$a,$b,$c,$d,$e)=@_;
112 $code.=<<___
if ($i==0);
113 mov
`4*$i`($inp),$xi[0]
115 mov
$xi[0],`4*$i`(%rsp)
117 $code.=<<___
if ($i<15);
119 mov
`4*$j`($inp),$xi[1]
124 lea
0x5a827999($xi[0],$e),$e
126 mov
$xi[1],`4*$j`(%rsp)
132 $code.=<<___
if ($i>=15);
133 mov
`4*($j%16)`(%rsp),$xi[1]
136 xor `4*(($j+2)%16)`(%rsp),$xi[1]
139 xor `4*(($j+8)%16)`(%rsp),$xi[1]
141 lea
0x5a827999($xi[0],$e),$e
142 xor `4*(($j+13)%16)`(%rsp),$xi[1]
147 mov
$xi[1],`4*($j%16)`(%rsp)
150 unshift(@xi,pop(@xi));
154 my ($i,$a,$b,$c,$d,$e)=@_;
156 my $K=($i<40)?
0x6ed9eba1:0xca62c1d6;
157 $code.=<<___
if ($i<79);
158 mov
`4*($j%16)`(%rsp),$xi[1]
161 xor `4*(($j+2)%16)`(%rsp),$xi[1]
165 xor `4*(($j+8)%16)`(%rsp),$xi[1]
168 xor `4*(($j+13)%16)`(%rsp),$xi[1]
173 $code.=<<___
if ($i<76);
174 mov
$xi[1],`4*($j%16)`(%rsp)
176 $code.=<<___
if ($i==79);
187 unshift(@xi,pop(@xi));
191 my ($i,$a,$b,$c,$d,$e)=@_;
194 mov
`4*($j%16)`(%rsp),$xi[1]
197 xor `4*(($j+2)%16)`(%rsp),$xi[1]
200 xor `4*(($j+8)%16)`(%rsp),$xi[1]
202 lea
0x8f1bbcdc($xi[0],$e),$e
204 xor `4*(($j+13)%16)`(%rsp),$xi[1]
210 mov
$xi[1],`4*($j%16)`(%rsp)
213 unshift(@xi,pop(@xi));
218 .extern OPENSSL_ia32cap_P
219 .hidden OPENSSL_ia32cap_P
221 .globl sha1_block_data_order
222 .type sha1_block_data_order
,\
@function,3
224 sha1_block_data_order
:
225 mov OPENSSL_ia32cap_P
+0(%rip),%r9d
226 mov OPENSSL_ia32cap_P
+4(%rip),%r8d
227 test \
$IA32CAP_MASK1_SSSE3,%r8d # check SSSE3 bit
230 $code.=<<___
if ($avx);
231 and \
$IA32CAP_MASK1_AVX,%r8d # mask AVX bit
232 and \
$IA32CAP_MASK0_INTEL,%r9d # mask "Intel CPU" bit
234 cmp \
$(IA32CAP_MASK0_INTEL
| IA32CAP_MASK1_AVX
),%r8d
247 mov
%rdi,$ctx # reassigned argument
249 mov
%rsi,$inp # reassigned argument
251 mov
%rdx,$num # reassigned argument
252 mov
%r11,`16*4`(%rsp)
265 for($i=0;$i<20;$i++) { &BODY_00_19
($i,@V); unshift(@V,pop(@V)); }
266 for(;$i<40;$i++) { &BODY_20_39
($i,@V); unshift(@V,pop(@V)); }
267 for(;$i<60;$i++) { &BODY_40_59
($i,@V); unshift(@V,pop(@V)); }
268 for(;$i<80;$i++) { &BODY_20_39
($i,@V); unshift(@V,pop(@V)); }
282 lea
`16*4`($inp),$inp
285 mov
`16*4`(%rsp),%rsi
293 .size sha1_block_data_order
,.-sha1_block_data_order
297 my @X=map("%xmm$_",(4..7,0..3));
298 my @Tx=map("%xmm$_",(8..10));
299 my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp"); # size optimization
300 my @T=("%esi","%edi");
304 my $_rol=sub { &rol
(@_) };
305 my $_ror=sub { &ror
(@_) };
308 .type sha1_block_data_order_ssse3
,\
@function,3
310 sha1_block_data_order_ssse3
:
315 lea
`-64-($win64?5*16:0)`(%rsp),%rsp
317 $code.=<<___
if ($win64);
318 movaps
%xmm6,64+0(%rsp)
319 movaps
%xmm7,64+16(%rsp)
320 movaps
%xmm8,64+32(%rsp)
321 movaps
%xmm9,64+48(%rsp)
322 movaps
%xmm10,64+64(%rsp)
326 mov
%rdi,$ctx # reassigned argument
327 mov
%rsi,$inp # reassigned argument
328 mov
%rdx,$num # reassigned argument
332 lea K_XX_XX
(%rip),$K_XX_XX
334 mov
0($ctx),$A # load context
338 mov
$B,@T[0] # magic seed
341 movdqa
64($K_XX_XX),@X[2] # pbswap mask
342 movdqa
0($K_XX_XX),@Tx[1] # K_00_19
343 movdqu
0($inp),@X[-4&7] # load input to %xmm[0-3]
344 movdqu
16($inp),@X[-3&7]
345 movdqu
32($inp),@X[-2&7]
346 movdqu
48($inp),@X[-1&7]
347 pshufb
@X[2],@X[-4&7] # byte swap
349 pshufb
@X[2],@X[-3&7]
350 pshufb
@X[2],@X[-2&7]
351 pshufb
@X[2],@X[-1&7]
352 paddd
@Tx[1],@X[-4&7] # add K_00_19
353 paddd
@Tx[1],@X[-3&7]
354 paddd
@Tx[1],@X[-2&7]
355 movdqa
@X[-4&7],0(%rsp) # X[]+K xfer to IALU
356 psubd
@Tx[1],@X[-4&7] # restore X[]
357 movdqa
@X[-3&7],16(%rsp)
358 psubd
@Tx[1],@X[-3&7]
359 movdqa
@X[-2&7],32(%rsp)
360 psubd
@Tx[1],@X[-2&7]
364 sub AUTOLOAD
() # thunk [simplified] 32-bit style perlasm
365 { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
367 $arg = "\$$arg" if ($arg*1 eq $arg);
368 $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
371 sub Xupdate_ssse3_16_31
() # recall that $Xi starts with 4
374 my @insns = (&$body,&$body,&$body,&$body); # 40 instructions
377 &movdqa
(@X[0],@X[-3&7]);
380 &movdqa
(@Tx[0],@X[-1&7]);
381 &palignr
(@X[0],@X[-4&7],8); # compose "X[-14]" in "X[0]"
385 &paddd
(@Tx[1],@X[-1&7]);
388 &psrldq
(@Tx[0],4); # "X[-3]", 3 dwords
391 &pxor
(@X[0],@X[-4&7]); # "X[0]"^="X[-16]"
395 &pxor
(@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]"
401 &pxor
(@X[0],@Tx[0]); # "X[0]"^="X[-3]"^"X[-8]"
404 &movdqa
(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
408 &movdqa
(@Tx[2],@X[0]);
409 &movdqa
(@Tx[0],@X[0]);
415 &pslldq
(@Tx[2],12); # "X[0]"<<96, extract one dword
416 &paddd
(@X[0],@X[0]);
425 &movdqa
(@Tx[1],@Tx[2]);
430 &por
(@X[0],@Tx[0]); # "X[0]"<<<=1
437 &pxor
(@X[0],@Tx[2]);
440 &movdqa
(@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)"); # K_XX_XX
444 &pxor
(@X[0],@Tx[1]); # "X[0]"^=("X[0]">>96)<<<2
446 foreach (@insns) { eval; } # remaining instructions [if any]
448 $Xi++; push(@X,shift(@X)); # "rotate" X[]
449 push(@Tx,shift(@Tx));
452 sub Xupdate_ssse3_32_79
()
455 my @insns = (&$body,&$body,&$body,&$body); # 32 to 48 instructions
458 &movdqa
(@Tx[0],@X[-1&7]) if ($Xi==8);
459 eval(shift(@insns)); # body_20_39
460 &pxor
(@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]"
461 &palignr
(@Tx[0],@X[-2&7],8); # compose "X[-6]"
464 eval(shift(@insns)); # rol
466 &pxor
(@X[0],@X[-7&7]); # "X[0]"^="X[-28]"
468 eval(shift(@insns)) if (@insns[0] !~ /&ro[rl]/);
470 &movdqa
(@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
471 } else { # ... or load next one
472 &movdqa
(@Tx[2],eval(16*($Xi/5))."($K_XX_XX)");
474 &paddd
(@Tx[1],@X[-1&7]);
475 eval(shift(@insns)); # ror
478 &pxor
(@X[0],@Tx[0]); # "X[0]"^="X[-6]"
479 eval(shift(@insns)); # body_20_39
482 eval(shift(@insns)); # rol
484 &movdqa
(@Tx[0],@X[0]);
485 &movdqa
(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
488 eval(shift(@insns)); # ror
492 eval(shift(@insns)); # body_20_39
496 eval(shift(@insns)); # rol
499 eval(shift(@insns)); # ror
502 &por
(@X[0],@Tx[0]); # "X[0]"<<<=2
503 eval(shift(@insns)); # body_20_39
505 &movdqa
(@Tx[1],@X[0]) if ($Xi<19);
507 eval(shift(@insns)); # rol
510 eval(shift(@insns)); # rol
513 foreach (@insns) { eval; } # remaining instructions
515 $Xi++; push(@X,shift(@X)); # "rotate" X[]
516 push(@Tx,shift(@Tx));
519 sub Xuplast_ssse3_80
()
522 my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
526 &paddd
(@Tx[1],@X[-1&7]);
532 &movdqa
(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer IALU
534 foreach (@insns) { eval; } # remaining instructions
537 &je
(".Ldone_ssse3");
539 unshift(@Tx,pop(@Tx));
541 &movdqa
(@X[2],"64($K_XX_XX)"); # pbswap mask
542 &movdqa
(@Tx[1],"0($K_XX_XX)"); # K_00_19
543 &movdqu
(@X[-4&7],"0($inp)"); # load input
544 &movdqu
(@X[-3&7],"16($inp)");
545 &movdqu
(@X[-2&7],"32($inp)");
546 &movdqu
(@X[-1&7],"48($inp)");
547 &pshufb
(@X[-4&7],@X[2]); # byte swap
556 my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
561 &pshufb
(@X[($Xi-3)&7],@X[2]);
564 &paddd
(@X[($Xi-4)&7],@Tx[1]);
569 &movdqa
(eval(16*$Xi)."(%rsp)",@X[($Xi-4)&7]); # X[]+K xfer to IALU
572 &psubd
(@X[($Xi-4)&7],@Tx[1]);
574 foreach (@insns) { eval; }
581 my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
584 foreach (@insns) { eval; }
589 '($a,$b,$c,$d,$e)=@V;'.
590 '&add ($e,eval(4*($j&15))."(%rsp)");', # X[]+K xfer
592 '&mov (@T[1],$a);', # $b in next round
594 '&and (@T[0],$c);', # ($b&($c^$d))
595 '&xor ($c,$d);', # restore $c
598 '&$_ror ($b,$j?7:2);', # $b>>>2
599 '&add ($e,@T[0]);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
605 '($a,$b,$c,$d,$e)=@V;'.
606 '&add ($e,eval(4*($j++&15))."(%rsp)");', # X[]+K xfer
607 '&xor (@T[0],$d);', # ($b^$d)
608 '&mov (@T[1],$a);', # $b in next round
610 '&xor (@T[0],$c);', # ($b^$d^$c)
612 '&$_ror ($b,7);', # $b>>>2
613 '&add ($e,@T[0]);' .'unshift(@V,pop(@V)); unshift(@T,pop(@T));'
619 '($a,$b,$c,$d,$e)=@V;'.
622 '&add ($e,eval(4*($j++&15))."(%rsp)");', # X[]+K xfer
624 '&and (@T[0],$c);', # ($b&($c^$d))
625 '&$_ror ($b,7);', # $b>>>2
627 '&mov (@T[1],$a);', # $b in next round
630 '&xor ($c,$d);', # restore $c
631 '&add ($e,$a);' .'unshift(@V,pop(@V)); unshift(@T,pop(@T));'
638 &Xupdate_ssse3_16_31
(\
&body_00_19
);
639 &Xupdate_ssse3_16_31
(\
&body_00_19
);
640 &Xupdate_ssse3_16_31
(\
&body_00_19
);
641 &Xupdate_ssse3_16_31
(\
&body_00_19
);
642 &Xupdate_ssse3_32_79
(\
&body_00_19
);
643 &Xupdate_ssse3_32_79
(\
&body_20_39
);
644 &Xupdate_ssse3_32_79
(\
&body_20_39
);
645 &Xupdate_ssse3_32_79
(\
&body_20_39
);
646 &Xupdate_ssse3_32_79
(\
&body_20_39
);
647 &Xupdate_ssse3_32_79
(\
&body_20_39
);
648 &Xupdate_ssse3_32_79
(\
&body_40_59
);
649 &Xupdate_ssse3_32_79
(\
&body_40_59
);
650 &Xupdate_ssse3_32_79
(\
&body_40_59
);
651 &Xupdate_ssse3_32_79
(\
&body_40_59
);
652 &Xupdate_ssse3_32_79
(\
&body_40_59
);
653 &Xupdate_ssse3_32_79
(\
&body_20_39
);
654 &Xuplast_ssse3_80
(\
&body_20_39
); # can jump to "done"
656 $saved_j=$j; @saved_V=@V;
658 &Xloop_ssse3
(\
&body_20_39
);
659 &Xloop_ssse3
(\
&body_20_39
);
660 &Xloop_ssse3
(\
&body_20_39
);
663 add
0($ctx),$A # update context
670 mov
@T[0],$B # magic seed
679 $j=$saved_j; @V=@saved_V;
681 &Xtail_ssse3
(\
&body_20_39
);
682 &Xtail_ssse3
(\
&body_20_39
);
683 &Xtail_ssse3
(\
&body_20_39
);
686 add
0($ctx),$A # update context
697 $code.=<<___
if ($win64);
698 movaps
64+0(%rsp),%xmm6
699 movaps
64+16(%rsp),%xmm7
700 movaps
64+32(%rsp),%xmm8
701 movaps
64+48(%rsp),%xmm9
702 movaps
64+64(%rsp),%xmm10
705 lea
`64+($win64?5*16:0)`(%rsp),%rsi
712 .size sha1_block_data_order_ssse3
,.-sha1_block_data_order_ssse3
717 my @X=map("%xmm$_",(4..7,0..3));
718 my @Tx=map("%xmm$_",(8..10));
719 my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp"); # size optimization
720 my @T=("%esi","%edi");
724 my $_rol=sub { &shld
(@_[0],@_) };
725 my $_ror=sub { &shrd
(@_[0],@_) };
728 .type sha1_block_data_order_avx
,\
@function,3
730 sha1_block_data_order_avx
:
735 lea
`-64-($win64?5*16:0)`(%rsp),%rsp
737 $code.=<<___
if ($win64);
738 movaps
%xmm6,64+0(%rsp)
739 movaps
%xmm7,64+16(%rsp)
740 movaps
%xmm8,64+32(%rsp)
741 movaps
%xmm9,64+48(%rsp)
742 movaps
%xmm10,64+64(%rsp)
746 mov
%rdi,$ctx # reassigned argument
747 mov
%rsi,$inp # reassigned argument
748 mov
%rdx,$num # reassigned argument
753 lea K_XX_XX
(%rip),$K_XX_XX
755 mov
0($ctx),$A # load context
759 mov
$B,@T[0] # magic seed
762 vmovdqa
64($K_XX_XX),@X[2] # pbswap mask
763 vmovdqa
0($K_XX_XX),@Tx[1] # K_00_19
764 vmovdqu
0($inp),@X[-4&7] # load input to %xmm[0-3]
765 vmovdqu
16($inp),@X[-3&7]
766 vmovdqu
32($inp),@X[-2&7]
767 vmovdqu
48($inp),@X[-1&7]
768 vpshufb
@X[2],@X[-4&7],@X[-4&7] # byte swap
770 vpshufb
@X[2],@X[-3&7],@X[-3&7]
771 vpshufb
@X[2],@X[-2&7],@X[-2&7]
772 vpshufb
@X[2],@X[-1&7],@X[-1&7]
773 vpaddd
@Tx[1],@X[-4&7],@X[0] # add K_00_19
774 vpaddd
@Tx[1],@X[-3&7],@X[1]
775 vpaddd
@Tx[1],@X[-2&7],@X[2]
776 vmovdqa
@X[0],0(%rsp) # X[]+K xfer to IALU
777 vmovdqa
@X[1],16(%rsp)
778 vmovdqa
@X[2],32(%rsp)
782 sub Xupdate_avx_16_31
() # recall that $Xi starts with 4
785 my @insns = (&$body,&$body,&$body,&$body); # 40 instructions
790 &vpalignr
(@X[0],@X[-3&7],@X[-4&7],8); # compose "X[-14]" in "X[0]"
794 &vpaddd
(@Tx[1],@Tx[1],@X[-1&7]);
797 &vpsrldq
(@Tx[0],@X[-1&7],4); # "X[-3]", 3 dwords
800 &vpxor
(@X[0],@X[0],@X[-4&7]); # "X[0]"^="X[-16]"
804 &vpxor
(@Tx[0],@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]"
810 &vpxor
(@X[0],@X[0],@Tx[0]); # "X[0]"^="X[-3]"^"X[-8]"
813 &vmovdqa
(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
817 &vpsrld
(@Tx[0],@X[0],31);
823 &vpslldq
(@Tx[2],@X[0],12); # "X[0]"<<96, extract one dword
824 &vpaddd
(@X[0],@X[0],@X[0]);
830 &vpsrld
(@Tx[1],@Tx[2],30);
831 &vpor
(@X[0],@X[0],@Tx[0]); # "X[0]"<<<=1
837 &vpslld
(@Tx[2],@Tx[2],2);
838 &vpxor
(@X[0],@X[0],@Tx[1]);
844 &vpxor
(@X[0],@X[0],@Tx[2]); # "X[0]"^=("X[0]">>96)<<<2
847 &vmovdqa
(@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)"); # K_XX_XX
852 foreach (@insns) { eval; } # remaining instructions [if any]
854 $Xi++; push(@X,shift(@X)); # "rotate" X[]
855 push(@Tx,shift(@Tx));
858 sub Xupdate_avx_32_79
()
861 my @insns = (&$body,&$body,&$body,&$body); # 32 to 48 instructions
864 &vpalignr
(@Tx[0],@X[-1&7],@X[-2&7],8); # compose "X[-6]"
865 &vpxor
(@X[0],@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]"
866 eval(shift(@insns)); # body_20_39
869 eval(shift(@insns)); # rol
871 &vpxor
(@X[0],@X[0],@X[-7&7]); # "X[0]"^="X[-28]"
873 eval(shift(@insns)) if (@insns[0] !~ /&ro[rl]/);
875 &vmovdqa
(@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
876 } else { # ... or load next one
877 &vmovdqa
(@Tx[2],eval(16*($Xi/5))."($K_XX_XX)");
879 &vpaddd
(@Tx[1],@Tx[1],@X[-1&7]);
880 eval(shift(@insns)); # ror
883 &vpxor
(@X[0],@X[0],@Tx[0]); # "X[0]"^="X[-6]"
884 eval(shift(@insns)); # body_20_39
887 eval(shift(@insns)); # rol
889 &vpsrld
(@Tx[0],@X[0],30);
890 &vmovdqa
(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
893 eval(shift(@insns)); # ror
896 &vpslld
(@X[0],@X[0],2);
897 eval(shift(@insns)); # body_20_39
900 eval(shift(@insns)); # rol
903 eval(shift(@insns)); # ror
906 &vpor
(@X[0],@X[0],@Tx[0]); # "X[0]"<<<=2
907 eval(shift(@insns)); # body_20_39
909 &vmovdqa
(@Tx[1],@X[0]) if ($Xi<19);
911 eval(shift(@insns)); # rol
914 eval(shift(@insns)); # rol
917 foreach (@insns) { eval; } # remaining instructions
919 $Xi++; push(@X,shift(@X)); # "rotate" X[]
920 push(@Tx,shift(@Tx));
926 my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
930 &vpaddd
(@Tx[1],@Tx[1],@X[-1&7]);
936 &vmovdqa
(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer IALU; VEX-encoded store like the rest of the AVX path (no legacy-SSE mix)
938 foreach (@insns) { eval; } # remaining instructions
943 unshift(@Tx,pop(@Tx));
945 &vmovdqa
(@X[2],"64($K_XX_XX)"); # pbswap mask
946 &vmovdqa
(@Tx[1],"0($K_XX_XX)"); # K_00_19
947 &vmovdqu
(@X[-4&7],"0($inp)"); # load input
948 &vmovdqu
(@X[-3&7],"16($inp)");
949 &vmovdqu
(@X[-2&7],"32($inp)");
950 &vmovdqu
(@X[-1&7],"48($inp)");
951 &vpshufb
(@X[-4&7],@X[-4&7],@X[2]); # byte swap
960 my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
965 &vpshufb
(@X[($Xi-3)&7],@X[($Xi-3)&7],@X[2]);
968 &vpaddd
(@X[$Xi&7],@X[($Xi-4)&7],@Tx[1]);
973 &vmovdqa
(eval(16*$Xi)."(%rsp)",@X[$Xi&7]); # X[]+K xfer to IALU
977 foreach (@insns) { eval; }
984 my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
987 foreach (@insns) { eval; }
994 &Xupdate_avx_16_31
(\
&body_00_19
);
995 &Xupdate_avx_16_31
(\
&body_00_19
);
996 &Xupdate_avx_16_31
(\
&body_00_19
);
997 &Xupdate_avx_16_31
(\
&body_00_19
);
998 &Xupdate_avx_32_79
(\
&body_00_19
);
999 &Xupdate_avx_32_79
(\
&body_20_39
);
1000 &Xupdate_avx_32_79
(\
&body_20_39
);
1001 &Xupdate_avx_32_79
(\
&body_20_39
);
1002 &Xupdate_avx_32_79
(\
&body_20_39
);
1003 &Xupdate_avx_32_79
(\
&body_20_39
);
1004 &Xupdate_avx_32_79
(\
&body_40_59
);
1005 &Xupdate_avx_32_79
(\
&body_40_59
);
1006 &Xupdate_avx_32_79
(\
&body_40_59
);
1007 &Xupdate_avx_32_79
(\
&body_40_59
);
1008 &Xupdate_avx_32_79
(\
&body_40_59
);
1009 &Xupdate_avx_32_79
(\
&body_20_39
);
1010 &Xuplast_avx_80
(\
&body_20_39
); # can jump to "done"
1012 $saved_j=$j; @saved_V=@V;
1014 &Xloop_avx
(\
&body_20_39
);
1015 &Xloop_avx
(\
&body_20_39
);
1016 &Xloop_avx
(\
&body_20_39
);
1019 add
0($ctx),$A # update context
1026 mov
@T[0],$B # magic seed
1035 $j=$saved_j; @V=@saved_V;
1037 &Xtail_avx
(\
&body_20_39
);
1038 &Xtail_avx
(\
&body_20_39
);
1039 &Xtail_avx
(\
&body_20_39
);
1044 add
0($ctx),$A # update context
1055 $code.=<<___
if ($win64);
1056 movaps
64+0(%rsp),%xmm6
1057 movaps
64+16(%rsp),%xmm7
1058 movaps
64+32(%rsp),%xmm8
1059 movaps
64+48(%rsp),%xmm9
1060 movaps
64+64(%rsp),%xmm10
1063 lea
`64+($win64?5*16:0)`(%rsp),%rsi
1070 .size sha1_block_data_order_avx
,.-sha1_block_data_order_avx
1076 .long
0x5a827999,0x5a827999,0x5a827999,0x5a827999 # K_00_19
1077 .long
0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 # K_20_39
1078 .long
0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc # K_40_59
1079 .long
0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 # K_60_79
1080 .long
0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap mask
1084 .asciz
"SHA1 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
1088 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
1089 # CONTEXT *context,DISPATCHER_CONTEXT *disp)
1097 .extern __imp_RtlVirtualUnwind
1098 .type se_handler
,\
@abi-omnipotent
1112 mov
120($context),%rax # pull context->Rax
1113 mov
248($context),%rbx # pull context->Rip
1115 lea
.Lprologue
(%rip),%r10
1116 cmp %r10,%rbx # context->Rip<.Lprologue
1117 jb
.Lcommon_seh_tail
1119 mov
152($context),%rax # pull context->Rsp
1121 lea
.Lepilogue
(%rip),%r10
1122 cmp %r10,%rbx # context->Rip>=.Lepilogue
1123 jae
.Lcommon_seh_tail
1125 mov
`16*4`(%rax),%rax # pull saved stack pointer
1132 mov
%rbx,144($context) # restore context->Rbx
1133 mov
%rbp,160($context) # restore context->Rbp
1134 mov
%r12,216($context) # restore context->R12
1135 mov
%r13,224($context) # restore context->R13
1137 jmp
.Lcommon_seh_tail
1138 .size se_handler
,.-se_handler
1140 .type ssse3_handler
,\
@abi-omnipotent
1154 mov
120($context),%rax # pull context->Rax
1155 mov
248($context),%rbx # pull context->Rip
1157 mov
8($disp),%rsi # disp->ImageBase
1158 mov
56($disp),%r11 # disp->HandlerData
1160 mov
0(%r11),%r10d # HandlerData[0]
1161 lea
(%rsi,%r10),%r10 # prologue label
1162 cmp %r10,%rbx # context->Rip<prologue label
1163 jb
.Lcommon_seh_tail
1165 mov
152($context),%rax # pull context->Rsp
1167 mov
4(%r11),%r10d # HandlerData[1]
1168 lea
(%rsi,%r10),%r10 # epilogue label
1169 cmp %r10,%rbx # context->Rip>=epilogue label
1170 jae
.Lcommon_seh_tail
1173 lea
512($context),%rdi # &context.Xmm6
1175 .long
0xa548f3fc # cld; rep movsq
1176 lea
`24+64+5*16`(%rax),%rax # adjust stack pointer
1181 mov
%rbx,144($context) # restore context->Rbx
1182 mov
%rbp,160($context) # restore context->Rbp
1183 mov
%r12,216($context) # restore cotnext->R12
1188 mov
%rax,152($context) # restore context->Rsp
1189 mov
%rsi,168($context) # restore context->Rsi
1190 mov
%rdi,176($context) # restore context->Rdi
1192 mov
40($disp),%rdi # disp->ContextRecord
1193 mov
$context,%rsi # context
1194 mov \
$154,%ecx # sizeof(CONTEXT)
1195 .long
0xa548f3fc # cld; rep movsq
1198 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
1199 mov
8(%rsi),%rdx # arg2, disp->ImageBase
1200 mov
0(%rsi),%r8 # arg3, disp->ControlPc
1201 mov
16(%rsi),%r9 # arg4, disp->FunctionEntry
1202 mov
40(%rsi),%r10 # disp->ContextRecord
1203 lea
56(%rsi),%r11 # &disp->HandlerData
1204 lea
24(%rsi),%r12 # &disp->EstablisherFrame
1205 mov
%r10,32(%rsp) # arg5
1206 mov
%r11,40(%rsp) # arg6
1207 mov
%r12,48(%rsp) # arg7
1208 mov
%rcx,56(%rsp) # arg8, (NULL)
1209 call
*__imp_RtlVirtualUnwind
(%rip)
1211 mov \
$1,%eax # ExceptionContinueSearch
1223 .size ssse3_handler
,.-ssse3_handler
1227 .rva
.LSEH_begin_sha1_block_data_order
1228 .rva
.LSEH_end_sha1_block_data_order
1229 .rva
.LSEH_info_sha1_block_data_order
1230 .rva
.LSEH_begin_sha1_block_data_order_ssse3
1231 .rva
.LSEH_end_sha1_block_data_order_ssse3
1232 .rva
.LSEH_info_sha1_block_data_order_ssse3
1234 $code.=<<___
if ($avx);
1235 .rva
.LSEH_begin_sha1_block_data_order_avx
1236 .rva
.LSEH_end_sha1_block_data_order_avx
1237 .rva
.LSEH_info_sha1_block_data_order_avx
1242 .LSEH_info_sha1_block_data_order
:
1245 .LSEH_info_sha1_block_data_order_ssse3
:
1248 .rva
.Lprologue_ssse3
,.Lepilogue_ssse3
# HandlerData[]
1250 $code.=<<___
if ($avx);
1251 .LSEH_info_sha1_block_data_order_avx
:
1254 .rva
.Lprologue_avx
,.Lepilogue_avx
# HandlerData[]
1258 ####################################################################
1260 $code =~ s/\`([^\`]*)\`/eval $1/gem;