import libcrypto (LibreSSL 2.5.2)
[unleashed.git] / lib / libcrypto / sha / asm / sha1-x86_64.pl
blob147d21570bc94606fbd9353f6bfe966d03e286b3
1 #!/usr/bin/env perl
3 # ====================================================================
4 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
10 # sha1_block procedure for x86_64.
12 # It was brought to my attention that on EM64T compiler-generated code
13 # was far behind 32-bit assembler implementation. This is unlike on
14 # Opteron where compiler-generated code was only 15% behind 32-bit
15 # assembler, which originally made it hard to motivate the effort.
16 # There was suggestion to mechanically translate 32-bit code, but I
17 # dismissed it, reasoning that x86_64 offers enough register bank
18 # capacity to fully utilize SHA-1 parallelism. Therefore this fresh
19 # implementation:-) However! While 64-bit code does perform better
20 # on Opteron, I failed to beat 32-bit assembler on EM64T core. Well,
21 # x86_64 does offer larger *addressable* bank, but out-of-order core
22 # reaches for even more registers through dynamic aliasing, and EM64T
23 # core must have managed to run-time optimize even 32-bit code just as
24 # good as 64-bit one. Performance improvement is summarized in the
25 # following table:
27 # gcc 3.4 32-bit asm cycles/byte
28 # Opteron +45% +20% 6.8
29 # Xeon P4 +65% +0% 9.9
30 # Core2 +60% +10% 7.0
32 # August 2009.
34 # The code was revised to minimize code size and to maximize
35 # "distance" between instructions producing input to 'lea'
36 # instruction and the 'lea' instruction itself, which is essential
37 # for Intel Atom core.
39 # October 2010.
41 # Add SSSE3, Supplemental[!] SSE3, implementation. The idea behind it
42 # is to offload message schedule denoted by Wt in NIST specification,
43 # or Xupdate in OpenSSL source, to SIMD unit. See sha1-586.pl module
44 # for background and implementation details. The only difference from
45 # 32-bit code is that 64-bit code doesn't have to spill @X[] elements
46 # to free temporary registers.
48 # April 2011.
50 # Add AVX code path. See sha1-586.pl for further information.
52 ######################################################################
53 # Current performance is summarized in following table. Numbers are
54 # CPU clock cycles spent to process single byte (less is better).
56 # x86_64 SSSE3 AVX
57 # P4 9.8 -
58 # Opteron 6.6 -
59 # Core2 6.7 6.1/+10% -
60 # Atom 11.0 9.7/+13% -
61 # Westmere 7.1 5.6/+27% -
62 # Sandy Bridge 7.9 6.3/+25% 5.2/+51%
# Command line: [flavour] output.  A single argument that contains a dot is
# taken to be the output file name and the flavour is left undefined.
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

# Win64 is selected by a masm/nasm/mingw64 flavour or a ".asm" output name.
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Locate the x86_64-xlate.pl translator: next to this script first, then in
# ../../perlasm/ relative to it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Enable the AVX code path only when the assembler is recent enough:
# GNU as >= 2.19, or (Win64 only) nasm >= 2.09 or ml64 >= 10.
$avx=1 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/ &&
	   $1>=2.19);
$avx=1 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/ &&
	   $1>=2.09);
$avx=1 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	   `ml64 2>&1` =~ /Version ([0-9]+)\./ &&
	   $1>=10);

# Everything printed from here on is piped through the translator.  Fail
# loudly if the pipe cannot be started instead of silently printing into
# an unopened handle.
open OUT,"| \"$^X\" $xlate $flavour $output"
    or die "can't call $xlate: $!";
*STDOUT=*OUT;
# Register map for the integer-only code path.  $ctx/$inp/$num are first
# set to the three argument registers and then immediately re-pointed at
# %r8/%r9/%r10; the generated prologue copies %rdi/%rsi/%rdx into them.
88 $ctx="%rdi"; # 1st arg
89 $inp="%rsi"; # 2nd arg
90 $num="%rdx"; # 3rd arg
92 # reassign arguments in order to produce more compact code
93 $ctx="%r8";
94 $inp="%r9";
95 $num="%r10";
# $t0..$t2 are scratch registers.  @xi is a pair of message-word registers
# whose order is swapped at the end of every BODY_* call.  $A..$E are the
# five working variables; @V is the list the round loops rotate.
97 $t0="%eax";
98 $t1="%ebx";
99 $t2="%ecx";
100 @xi=("%edx","%ebp");
101 $A="%esi";
102 $B="%edi";
103 $C="%r11d";
104 $D="%r12d";
105 $E="%r13d";
107 @V=($A,$B,$C,$D,$E);
# BODY_00_19($i,$a,$b,$c,$d,$e): append to $code the integer instructions
# for round $i, using the constant 0x5a827999.  $a..$e are register names.
#  - $i==0 additionally loads input word 0, byte-swaps it and stores it on
#    the stack;
#  - $i<15 loads and byte-swaps input word $i+1 while performing the round;
#  - $i>=15 instead derives word $i+1 from four stack slots (xor, rol 1)
#    and writes it back to its slot.
# The final statement swaps the two @xi registers for the next round.
109 sub BODY_00_19 {
110 my ($i,$a,$b,$c,$d,$e)=@_;
111 my $j=$i+1;
112 $code.=<<___ if ($i==0);
113 mov `4*$i`($inp),$xi[0]
114 bswap $xi[0]
115 mov $xi[0],`4*$i`(%rsp)
117 $code.=<<___ if ($i<15);
118 mov $c,$t0
119 mov `4*$j`($inp),$xi[1]
120 mov $a,$t2
121 xor $d,$t0
122 bswap $xi[1]
123 rol \$5,$t2
124 lea 0x5a827999($xi[0],$e),$e
125 and $b,$t0
126 mov $xi[1],`4*$j`(%rsp)
127 add $t2,$e
128 xor $d,$t0
129 rol \$30,$b
130 add $t0,$e
132 $code.=<<___ if ($i>=15);
133 mov `4*($j%16)`(%rsp),$xi[1]
134 mov $c,$t0
135 mov $a,$t2
136 xor `4*(($j+2)%16)`(%rsp),$xi[1]
137 xor $d,$t0
138 rol \$5,$t2
139 xor `4*(($j+8)%16)`(%rsp),$xi[1]
140 and $b,$t0
141 lea 0x5a827999($xi[0],$e),$e
142 xor `4*(($j+13)%16)`(%rsp),$xi[1]
143 xor $d,$t0
144 rol \$1,$xi[1]
145 add $t2,$e
146 rol \$30,$b
147 mov $xi[1],`4*($j%16)`(%rsp)
148 add $t0,$e
150 unshift(@xi,pop(@xi));
# BODY_20_39($i,$a,$b,$c,$d,$e): append one round to $code.  The constant
# is 0x6ed9eba1 when $i<40 and 0xca62c1d6 otherwise.
# For $i<79 the next message word is computed alongside the round, but it
# is written back to its stack slot only while $i<76.  For $i==79 the round
# is emitted alone.  Ends by swapping the two @xi registers.
153 sub BODY_20_39 {
154 my ($i,$a,$b,$c,$d,$e)=@_;
155 my $j=$i+1;
156 my $K=($i<40)?0x6ed9eba1:0xca62c1d6;
157 $code.=<<___ if ($i<79);
158 mov `4*($j%16)`(%rsp),$xi[1]
159 mov $c,$t0
160 mov $a,$t2
161 xor `4*(($j+2)%16)`(%rsp),$xi[1]
162 xor $b,$t0
163 rol \$5,$t2
164 lea $K($xi[0],$e),$e
165 xor `4*(($j+8)%16)`(%rsp),$xi[1]
166 xor $d,$t0
167 add $t2,$e
168 xor `4*(($j+13)%16)`(%rsp),$xi[1]
169 rol \$30,$b
170 add $t0,$e
171 rol \$1,$xi[1]
173 $code.=<<___ if ($i<76);
174 mov $xi[1],`4*($j%16)`(%rsp)
176 $code.=<<___ if ($i==79);
177 mov $c,$t0
178 mov $a,$t2
179 xor $b,$t0
180 lea $K($xi[0],$e),$e
181 rol \$5,$t2
182 xor $d,$t0
183 add $t2,$e
184 rol \$30,$b
185 add $t0,$e
187 unshift(@xi,pop(@xi));
# BODY_40_59($i,$a,$b,$c,$d,$e): append one round to $code, using the
# constant 0x8f1bbcdc and both $t0 and $t1 as scratch for the boolean
# function.  The next message word ($i+1) is always derived from four stack
# slots and written back.  Ends by swapping the two @xi registers.
190 sub BODY_40_59 {
191 my ($i,$a,$b,$c,$d,$e)=@_;
192 my $j=$i+1;
193 $code.=<<___;
194 mov `4*($j%16)`(%rsp),$xi[1]
195 mov $c,$t0
196 mov $c,$t1
197 xor `4*(($j+2)%16)`(%rsp),$xi[1]
198 and $d,$t0
199 mov $a,$t2
200 xor `4*(($j+8)%16)`(%rsp),$xi[1]
201 xor $d,$t1
202 lea 0x8f1bbcdc($xi[0],$e),$e
203 rol \$5,$t2
204 xor `4*(($j+13)%16)`(%rsp),$xi[1]
205 add $t0,$e
206 and $b,$t1
207 rol \$1,$xi[1]
208 add $t1,$e
209 rol \$30,$b
210 mov $xi[1],`4*($j%16)`(%rsp)
211 add $t2,$e
213 unshift(@xi,pop(@xi));
# Public entry point sha1_block_data_order plus the integer-only code path.
# The entry reads two words of OPENSSL_ia32cap_P.  If the SSSE3 bit is
# clear it jumps to .Lialu.  Otherwise it jumps to _ssse3_shortcut, or to
# _avx_shortcut when $avx was set at generation time and both the AVX and
# "Intel CPU" bits are set.
# .Lialu pushes %rbx/%rbp/%r12/%r13, reserves a frame aligned down to 64
# bytes, keeps the pre-adjustment %rsp at 64(%rsp), loads the five context
# words and enters .Lloop.  Each pass through .Lloop is the 80 generated
# rounds (the four for-loops below, rotating @V after every round), then
# the result is added into the context at $ctx, $inp advances by 64 and
# $num is decremented until it reaches zero.
216 $code.=<<___;
217 .text
218 .extern OPENSSL_ia32cap_P
219 .hidden OPENSSL_ia32cap_P
221 .globl sha1_block_data_order
222 .type sha1_block_data_order,\@function,3
223 .align 16
224 sha1_block_data_order:
225 mov OPENSSL_ia32cap_P+0(%rip),%r9d
226 mov OPENSSL_ia32cap_P+4(%rip),%r8d
227 test \$IA32CAP_MASK1_SSSE3,%r8d # check SSSE3 bit
228 jz .Lialu
230 $code.=<<___ if ($avx);
231 and \$IA32CAP_MASK1_AVX,%r8d # mask AVX bit
232 and \$IA32CAP_MASK0_INTEL,%r9d # mask "Intel CPU" bit
233 or %r9d,%r8d
234 cmp \$(IA32CAP_MASK0_INTEL | IA32CAP_MASK1_AVX),%r8d
235 je _avx_shortcut
237 $code.=<<___;
238 jmp _ssse3_shortcut
240 .align 16
241 .Lialu:
242 push %rbx
243 push %rbp
244 push %r12
245 push %r13
246 mov %rsp,%r11
247 mov %rdi,$ctx # reassigned argument
248 sub \$`8+16*4`,%rsp
249 mov %rsi,$inp # reassigned argument
250 and \$-64,%rsp
251 mov %rdx,$num # reassigned argument
252 mov %r11,`16*4`(%rsp)
253 .Lprologue:
255 mov 0($ctx),$A
256 mov 4($ctx),$B
257 mov 8($ctx),$C
258 mov 12($ctx),$D
259 mov 16($ctx),$E
260 jmp .Lloop
262 .align 16
263 .Lloop:
265 for($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
266 for(;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
267 for(;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
268 for(;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
269 $code.=<<___;
270 add 0($ctx),$A
271 add 4($ctx),$B
272 add 8($ctx),$C
273 add 12($ctx),$D
274 add 16($ctx),$E
275 mov $A,0($ctx)
276 mov $B,4($ctx)
277 mov $C,8($ctx)
278 mov $D,12($ctx)
279 mov $E,16($ctx)
281 sub \$1,$num
282 lea `16*4`($inp),$inp
283 jnz .Lloop
285 mov `16*4`(%rsp),%rsi
286 mov (%rsi),%r13
287 mov 8(%rsi),%r12
288 mov 16(%rsi),%rbp
289 mov 24(%rsi),%rbx
290 lea 32(%rsi),%rsp
291 .Lepilogue:
293 .size sha1_block_data_order,.-sha1_block_data_order
# SSSE3 code path: generator state and function prologue.
#   $Xi      index of the next message vector to produce (starts at 4)
#   @X       the eight %xmm names used for message-schedule vectors
#   @Tx      three temporaries, %xmm8-%xmm10
#   @V, @T   integer working registers; the assignment also re-points the
#            globals $A..$E at %eax/%ebx/%ecx/%edx/%ebp
#   $j       round counter used by the body_* snippets for stack offsets
#   $K_XX_XX register holding the address of the K_XX_XX constant table
#   $_rol/$_ror  indirection for rotates; here they emit plain rol/ror
#            (the AVX path substitutes shld/shrd)
# The emitted prologue saves %rbx/%rbp/%r12, reserves 64 bytes of stack
# (plus 5*16 for %xmm6-%xmm10 when $win64), turns $num from a block count
# into an end pointer ($inp + $num*64), loads the context and the first 64
# input bytes, byte-swaps them, and stores X[]+K for the first three
# vectors on the stack before jumping to .Loop_ssse3.
296 my $Xi=4;
297 my @X=map("%xmm$_",(4..7,0..3));
298 my @Tx=map("%xmm$_",(8..10));
299 my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp"); # size optimization
300 my @T=("%esi","%edi");
301 my $j=0;
302 my $K_XX_XX="%r11";
304 my $_rol=sub { &rol(@_) };
305 my $_ror=sub { &ror(@_) };
307 $code.=<<___;
308 .type sha1_block_data_order_ssse3,\@function,3
309 .align 16
310 sha1_block_data_order_ssse3:
311 _ssse3_shortcut:
312 push %rbx
313 push %rbp
314 push %r12
315 lea `-64-($win64?5*16:0)`(%rsp),%rsp
317 $code.=<<___ if ($win64);
318 movaps %xmm6,64+0(%rsp)
319 movaps %xmm7,64+16(%rsp)
320 movaps %xmm8,64+32(%rsp)
321 movaps %xmm9,64+48(%rsp)
322 movaps %xmm10,64+64(%rsp)
323 .Lprologue_ssse3:
325 $code.=<<___;
326 mov %rdi,$ctx # reassigned argument
327 mov %rsi,$inp # reassigned argument
328 mov %rdx,$num # reassigned argument
330 shl \$6,$num
331 add $inp,$num
332 lea K_XX_XX(%rip),$K_XX_XX
334 mov 0($ctx),$A # load context
335 mov 4($ctx),$B
336 mov 8($ctx),$C
337 mov 12($ctx),$D
338 mov $B,@T[0] # magic seed
339 mov 16($ctx),$E
341 movdqa 64($K_XX_XX),@X[2] # pbswap mask
342 movdqa 0($K_XX_XX),@Tx[1] # K_00_19
343 movdqu 0($inp),@X[-4&7] # load input to %xmm[0-3]
344 movdqu 16($inp),@X[-3&7]
345 movdqu 32($inp),@X[-2&7]
346 movdqu 48($inp),@X[-1&7]
347 pshufb @X[2],@X[-4&7] # byte swap
348 add \$64,$inp
349 pshufb @X[2],@X[-3&7]
350 pshufb @X[2],@X[-2&7]
351 pshufb @X[2],@X[-1&7]
352 paddd @Tx[1],@X[-4&7] # add K_00_19
353 paddd @Tx[1],@X[-3&7]
354 paddd @Tx[1],@X[-2&7]
355 movdqa @X[-4&7],0(%rsp) # X[]+K xfer to IALU
356 psubd @Tx[1],@X[-4&7] # restore X[]
357 movdqa @X[-3&7],16(%rsp)
358 psubd @Tx[1],@X[-3&7]
359 movdqa @X[-2&7],32(%rsp)
360 psubd @Tx[1],@X[-2&7]
361 jmp .Loop_ssse3
# Catch-all thunk for the simplified 32-bit style perlasm calls.  Any call
# to an undefined sub, e.g. &paddd(dst,src), lands here and is appended to
# $code as one tab-separated instruction line whose operands are the call
# arguments in reverse order.  A last argument that compares equal to its
# own numeric value is emitted as an immediate (prefixed with '$').
sub AUTOLOAD()
{
    (my $mnemonic = $AUTOLOAD) =~ s/.*:://;	# drop the package qualifier
    my $tail = pop;				# last call argument
    if ($tail*1 eq $tail) {
	$tail = "\$$tail";
    }
    my @operands = ($tail, reverse @_);
    $code .= "\t$mnemonic\t" . join(',', @operands) . "\n";
}
# Xupdate_ssse3_16_31($body): emit four integer rounds interleaved with the
# SSSE3 instructions that build one new message vector in @X[0] (the form
# ending in a rotate left by 1 plus a fix-up rotated by 2).
# $body is a code ref returning a list of Perl snippets.  It is called four
# times and the snippets are eval'ed one at a time between the SIMD
# instructions, so the order of the statements below is the schedule.
# Along the way @X[-1]+K is stored to the stack slot 16*(($Xi-1)&3) and the
# constant row 16*($Xi/5) is loaded from $K_XX_XX.  On exit $Xi is
# incremented and @X and @Tx are rotated by one position.
371 sub Xupdate_ssse3_16_31() # recall that $Xi starts with 4
372 { use integer;
373 my $body = shift;
374 my @insns = (&$body,&$body,&$body,&$body); # 40 instructions
375 my ($a,$b,$c,$d,$e);
377 &movdqa (@X[0],@X[-3&7]);
378 eval(shift(@insns));
379 eval(shift(@insns));
380 &movdqa (@Tx[0],@X[-1&7]);
381 &palignr(@X[0],@X[-4&7],8); # compose "X[-14]" in "X[0]"
382 eval(shift(@insns));
383 eval(shift(@insns));
385 &paddd (@Tx[1],@X[-1&7]);
386 eval(shift(@insns));
387 eval(shift(@insns));
388 &psrldq (@Tx[0],4); # "X[-3]", 3 dwords
389 eval(shift(@insns));
390 eval(shift(@insns));
391 &pxor (@X[0],@X[-4&7]); # "X[0]"^="X[-16]"
392 eval(shift(@insns));
393 eval(shift(@insns));
395 &pxor (@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]"
396 eval(shift(@insns));
397 eval(shift(@insns));
398 eval(shift(@insns));
399 eval(shift(@insns));
401 &pxor (@X[0],@Tx[0]); # "X[0]"^="X[-3]"^"X[-8]"
402 eval(shift(@insns));
403 eval(shift(@insns));
404 &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
405 eval(shift(@insns));
406 eval(shift(@insns));
408 &movdqa (@Tx[2],@X[0]);
409 &movdqa (@Tx[0],@X[0]);
410 eval(shift(@insns));
411 eval(shift(@insns));
412 eval(shift(@insns));
413 eval(shift(@insns));
415 &pslldq (@Tx[2],12); # "X[0]"<<96, extract one dword
416 &paddd (@X[0],@X[0]);
417 eval(shift(@insns));
418 eval(shift(@insns));
419 eval(shift(@insns));
420 eval(shift(@insns));
422 &psrld (@Tx[0],31);
423 eval(shift(@insns));
424 eval(shift(@insns));
425 &movdqa (@Tx[1],@Tx[2]);
426 eval(shift(@insns));
427 eval(shift(@insns));
429 &psrld (@Tx[2],30);
430 &por (@X[0],@Tx[0]); # "X[0]"<<<=1
431 eval(shift(@insns));
432 eval(shift(@insns));
433 eval(shift(@insns));
434 eval(shift(@insns));
436 &pslld (@Tx[1],2);
437 &pxor (@X[0],@Tx[2]);
438 eval(shift(@insns));
439 eval(shift(@insns));
440 &movdqa (@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)"); # K_XX_XX
441 eval(shift(@insns));
442 eval(shift(@insns));
444 &pxor (@X[0],@Tx[1]); # "X[0]"^=("X[0]">>96)<<<2
446 foreach (@insns) { eval; } # remaining instructions [if any]
448 $Xi++; push(@X,shift(@X)); # "rotate" X[]
449 push(@Tx,shift(@Tx));
# Xupdate_ssse3_32_79($body): emit four integer rounds interleaved with the
# SSSE3 instructions for one later message vector, using the form that
# xors earlier vectors into @X[0] and rotates left by 2
# (pslld 2 / psrld 30 / por).
# $body is a code ref returning a list of Perl snippets; it is called four
# times and the snippets are eval'ed between the SIMD instructions.  One
# extra snippet is consumed early unless the next one is a rotate.
# When $Xi is a multiple of 5 the next constant row is loaded from the
# table at $K_XX_XX, otherwise the current constant is copied forward.
# On exit $Xi is incremented and @X and @Tx are rotated by one position.
452 sub Xupdate_ssse3_32_79()
453 { use integer;
454 my $body = shift;
455 my @insns = (&$body,&$body,&$body,&$body); # 32 to 48 instructions
456 my ($a,$b,$c,$d,$e);
458 &movdqa (@Tx[0],@X[-1&7]) if ($Xi==8);
459 eval(shift(@insns)); # body_20_39
460 &pxor (@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]"
461 &palignr(@Tx[0],@X[-2&7],8); # compose "X[-6]"
462 eval(shift(@insns));
463 eval(shift(@insns));
464 eval(shift(@insns)); # rol
466 &pxor (@X[0],@X[-7&7]); # "X[0]"^="X[-28]"
467 eval(shift(@insns));
468 eval(shift(@insns)) if (@insns[0] !~ /&ro[rl]/);
469 if ($Xi%5) {
470 &movdqa (@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
471 } else { # ... or load next one
472 &movdqa (@Tx[2],eval(16*($Xi/5))."($K_XX_XX)");
474 &paddd (@Tx[1],@X[-1&7]);
475 eval(shift(@insns)); # ror
476 eval(shift(@insns));
478 &pxor (@X[0],@Tx[0]); # "X[0]"^="X[-6]"
479 eval(shift(@insns)); # body_20_39
480 eval(shift(@insns));
481 eval(shift(@insns));
482 eval(shift(@insns)); # rol
484 &movdqa (@Tx[0],@X[0]);
485 &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
486 eval(shift(@insns));
487 eval(shift(@insns));
488 eval(shift(@insns)); # ror
489 eval(shift(@insns));
491 &pslld (@X[0],2);
492 eval(shift(@insns)); # body_20_39
493 eval(shift(@insns));
494 &psrld (@Tx[0],30);
495 eval(shift(@insns));
496 eval(shift(@insns)); # rol
497 eval(shift(@insns));
498 eval(shift(@insns));
499 eval(shift(@insns)); # ror
500 eval(shift(@insns));
502 &por (@X[0],@Tx[0]); # "X[0]"<<<=2
503 eval(shift(@insns)); # body_20_39
504 eval(shift(@insns));
505 &movdqa (@Tx[1],@X[0]) if ($Xi<19);
506 eval(shift(@insns));
507 eval(shift(@insns)); # rol
508 eval(shift(@insns));
509 eval(shift(@insns));
510 eval(shift(@insns)); # rol
511 eval(shift(@insns));
513 foreach (@insns) { eval; } # remaining instructions
515 $Xi++; push(@X,shift(@X)); # "rotate" X[]
516 push(@Tx,shift(@Tx));
# Xuplast_ssse3_80($body): emit four integer rounds together with the last
# X[]+K store of the current block, then the end-of-input test: compare
# $inp with $num and jump to .Ldone_ssse3 when they are equal.
# On the fall-through path it starts on the next block: reloads the byte
# swap mask and K_00_19, loads 64 input bytes, byte-swaps the first vector
# and advances $inp by 64.  @Tx is rotated back by one and $Xi is reset to
# 0 for the Xloop_ssse3 calls that follow.
# $body is a code ref returning a list of Perl snippets to eval.
519 sub Xuplast_ssse3_80()
520 { use integer;
521 my $body = shift;
522 my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
523 my ($a,$b,$c,$d,$e);
525 eval(shift(@insns));
526 &paddd (@Tx[1],@X[-1&7]);
527 eval(shift(@insns));
528 eval(shift(@insns));
529 eval(shift(@insns));
530 eval(shift(@insns));
532 &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer IALU
534 foreach (@insns) { eval; } # remaining instructions
536 &cmp ($inp,$num);
537 &je (".Ldone_ssse3");
539 unshift(@Tx,pop(@Tx));
541 &movdqa (@X[2],"64($K_XX_XX)"); # pbswap mask
542 &movdqa (@Tx[1],"0($K_XX_XX)"); # K_00_19
543 &movdqu (@X[-4&7],"0($inp)"); # load input
544 &movdqu (@X[-3&7],"16($inp)");
545 &movdqu (@X[-2&7],"32($inp)");
546 &movdqu (@X[-1&7],"48($inp)");
547 &pshufb (@X[-4&7],@X[2]); # byte swap
548 &add ($inp,64);
550 $Xi=0;
# Xloop_ssse3($body): emit four integer rounds of the current block while
# preparing the already-loaded next block.  For the current $Xi it
# byte-swaps vector ($Xi-3)&7, adds K (@Tx[1]) to vector ($Xi-4)&7, stores
# that sum at 16*$Xi(%rsp) and then subtracts K again to restore the
# vector.  $Xi is incremented on exit; @X and @Tx are not rotated here.
# $body is a code ref returning a list of Perl snippets to eval.
553 sub Xloop_ssse3()
554 { use integer;
555 my $body = shift;
556 my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
557 my ($a,$b,$c,$d,$e);
559 eval(shift(@insns));
560 eval(shift(@insns));
561 &pshufb (@X[($Xi-3)&7],@X[2]);
562 eval(shift(@insns));
563 eval(shift(@insns));
564 &paddd (@X[($Xi-4)&7],@Tx[1]);
565 eval(shift(@insns));
566 eval(shift(@insns));
567 eval(shift(@insns));
568 eval(shift(@insns));
569 &movdqa (eval(16*$Xi)."(%rsp)",@X[($Xi-4)&7]); # X[]+K xfer to IALU
570 eval(shift(@insns));
571 eval(shift(@insns));
572 &psubd (@X[($Xi-4)&7],@Tx[1]);
574 foreach (@insns) { eval; }
575 $Xi++;
# Xtail_ssse3($body): emit four integer rounds with no SIMD work mixed in.
# $body is a code ref returning a list of Perl snippets; it is called four
# times and every returned snippet is eval'ed in order.
sub Xtail_ssse3()
{ use integer;
  my $body = shift;
  my ($a,$b,$c,$d,$e);		# targets of the eval'ed snippets; keep names

    # Collect all four rounds' snippets first, then run them one by one.
    for my $snippet (map { &$body } 1..4) {
	eval $snippet;
    }
}
# body_00_19(): returns the Perl snippets (strings) that, when eval'ed in
# order by an Xupdate_*/Xloop_*/Xtail_* routine, emit one integer round.
# The first snippet also loads @V into the caller's $a..$e; the last one
# also increments $j and rotates @V and @T.  The X[]+K word is read from
# stack slot 4*($j&15).  The rotate of $b is by 7 except when $j is 0,
# where it is by 2.
587 sub body_00_19 () {
589 '($a,$b,$c,$d,$e)=@V;'.
590 '&add ($e,eval(4*($j&15))."(%rsp)");', # X[]+K xfer
591 '&xor ($c,$d);',
592 '&mov (@T[1],$a);', # $b in next round
593 '&$_rol ($a,5);',
594 '&and (@T[0],$c);', # ($b&($c^$d))
595 '&xor ($c,$d);', # restore $c
596 '&xor (@T[0],$d);',
597 '&add ($e,$a);',
598 '&$_ror ($b,$j?7:2);', # $b>>>2
599 '&add ($e,@T[0]);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
# body_20_39(): returns the Perl snippets (strings) that emit one integer
# round of the xor-based form when eval'ed in order by the caller.  The
# first snippet loads @V into the caller's $a..$e, reads the X[]+K word from
# stack slot 4*($j&15) and post-increments $j; the last one also rotates @V
# and @T.
603 sub body_20_39 () {
605 '($a,$b,$c,$d,$e)=@V;'.
606 '&add ($e,eval(4*($j++&15))."(%rsp)");', # X[]+K xfer
607 '&xor (@T[0],$d);', # ($b^$d)
608 '&mov (@T[1],$a);', # $b in next round
609 '&$_rol ($a,5);',
610 '&xor (@T[0],$c);', # ($b^$d^$c)
611 '&add ($e,$a);',
612 '&$_ror ($b,7);', # $b>>>2
613 '&add ($e,@T[0]);' .'unshift(@V,pop(@V)); unshift(@T,pop(@T));'
# body_40_59(): returns the Perl snippets (strings) that emit one integer
# round of the and/xor-based form when eval'ed in order by the caller.  The
# first snippet loads @V into the caller's $a..$e; the X[]+K word is read
# from stack slot 4*($j&15) with $j post-incremented; the last snippet also
# rotates @V and @T.
617 sub body_40_59 () {
619 '($a,$b,$c,$d,$e)=@V;'.
620 '&mov (@T[1],$c);',
621 '&xor ($c,$d);',
622 '&add ($e,eval(4*($j++&15))."(%rsp)");', # X[]+K xfer
623 '&and (@T[1],$d);',
624 '&and (@T[0],$c);', # ($b&($c^$d))
625 '&$_ror ($b,7);', # $b>>>2
626 '&add ($e,@T[1]);',
627 '&mov (@T[1],$a);', # $b in next round
628 '&$_rol ($a,5);',
629 '&add ($e,@T[0]);',
630 '&xor ($c,$d);', # restore $c
631 '&add ($e,$a);' .'unshift(@V,pop(@V)); unshift(@T,pop(@T));'
# Body of sha1_block_data_order_ssse3.  .Loop_ssse3 is assembled from 17
# generator calls of four rounds each.  Xuplast_ssse3_80 contains the
# end-of-input test and may jump to .Ldone_ssse3.  Because the last 12
# rounds are generated twice (once overlapped with preparing the next
# block via Xloop_ssse3, once plain via Xtail_ssse3), $j and @V are saved
# before the first copy and restored before the second.
# Both copies finish by adding the working variables into the context at
# $ctx; the looping copy also re-seeds $B from @T[0] and jumps back.  The
# tail then restores %xmm6-%xmm10 when $win64, and %r12/%rbp/%rbx and %rsp.
634 $code.=<<___;
635 .align 16
636 .Loop_ssse3:
638 &Xupdate_ssse3_16_31(\&body_00_19);
639 &Xupdate_ssse3_16_31(\&body_00_19);
640 &Xupdate_ssse3_16_31(\&body_00_19);
641 &Xupdate_ssse3_16_31(\&body_00_19);
642 &Xupdate_ssse3_32_79(\&body_00_19);
643 &Xupdate_ssse3_32_79(\&body_20_39);
644 &Xupdate_ssse3_32_79(\&body_20_39);
645 &Xupdate_ssse3_32_79(\&body_20_39);
646 &Xupdate_ssse3_32_79(\&body_20_39);
647 &Xupdate_ssse3_32_79(\&body_20_39);
648 &Xupdate_ssse3_32_79(\&body_40_59);
649 &Xupdate_ssse3_32_79(\&body_40_59);
650 &Xupdate_ssse3_32_79(\&body_40_59);
651 &Xupdate_ssse3_32_79(\&body_40_59);
652 &Xupdate_ssse3_32_79(\&body_40_59);
653 &Xupdate_ssse3_32_79(\&body_20_39);
654 &Xuplast_ssse3_80(\&body_20_39); # can jump to "done"
656 $saved_j=$j; @saved_V=@V;
658 &Xloop_ssse3(\&body_20_39);
659 &Xloop_ssse3(\&body_20_39);
660 &Xloop_ssse3(\&body_20_39);
662 $code.=<<___;
663 add 0($ctx),$A # update context
664 add 4($ctx),@T[0]
665 add 8($ctx),$C
666 add 12($ctx),$D
667 mov $A,0($ctx)
668 add 16($ctx),$E
669 mov @T[0],4($ctx)
670 mov @T[0],$B # magic seed
671 mov $C,8($ctx)
672 mov $D,12($ctx)
673 mov $E,16($ctx)
674 jmp .Loop_ssse3
676 .align 16
677 .Ldone_ssse3:
679 $j=$saved_j; @V=@saved_V;
681 &Xtail_ssse3(\&body_20_39);
682 &Xtail_ssse3(\&body_20_39);
683 &Xtail_ssse3(\&body_20_39);
685 $code.=<<___;
686 add 0($ctx),$A # update context
687 add 4($ctx),@T[0]
688 add 8($ctx),$C
689 mov $A,0($ctx)
690 add 12($ctx),$D
691 mov @T[0],4($ctx)
692 add 16($ctx),$E
693 mov $C,8($ctx)
694 mov $D,12($ctx)
695 mov $E,16($ctx)
697 $code.=<<___ if ($win64);
698 movaps 64+0(%rsp),%xmm6
699 movaps 64+16(%rsp),%xmm7
700 movaps 64+32(%rsp),%xmm8
701 movaps 64+48(%rsp),%xmm9
702 movaps 64+64(%rsp),%xmm10
704 $code.=<<___;
705 lea `64+($win64?5*16:0)`(%rsp),%rsi
706 mov 0(%rsi),%r12
707 mov 8(%rsi),%rbp
708 mov 16(%rsi),%rbx
709 lea 24(%rsi),%rsp
710 .Lepilogue_ssse3:
712 .size sha1_block_data_order_ssse3,.-sha1_block_data_order_ssse3
# AVX code path, generated only when $avx is set.  The generator state
# ($Xi, @X, @Tx, @V, @T, $j, $K_XX_XX) is re-declared for this block with
# the same initial values as in the SSSE3 path.  Here $_rol/$_ror emit
# shld/shrd with the register repeated as the extra operand.
# The emitted prologue saves %rbx/%rbp/%r12, reserves 64 bytes of stack
# (plus 5*16 for %xmm6-%xmm10 when $win64), issues vzeroupper, turns $num
# into an end pointer ($inp + $num*64), loads the context and the first 64
# input bytes, byte-swaps them, and stores X[]+K for the first three
# vectors on the stack before jumping to .Loop_avx.  The sums are written
# to @X[0..2], so the input vectors are left unmodified.
715 if ($avx) {
716 my $Xi=4;
717 my @X=map("%xmm$_",(4..7,0..3));
718 my @Tx=map("%xmm$_",(8..10));
719 my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp"); # size optimization
720 my @T=("%esi","%edi");
721 my $j=0;
722 my $K_XX_XX="%r11";
724 my $_rol=sub { &shld(@_[0],@_) };
725 my $_ror=sub { &shrd(@_[0],@_) };
727 $code.=<<___;
728 .type sha1_block_data_order_avx,\@function,3
729 .align 16
730 sha1_block_data_order_avx:
731 _avx_shortcut:
732 push %rbx
733 push %rbp
734 push %r12
735 lea `-64-($win64?5*16:0)`(%rsp),%rsp
737 $code.=<<___ if ($win64);
738 movaps %xmm6,64+0(%rsp)
739 movaps %xmm7,64+16(%rsp)
740 movaps %xmm8,64+32(%rsp)
741 movaps %xmm9,64+48(%rsp)
742 movaps %xmm10,64+64(%rsp)
743 .Lprologue_avx:
745 $code.=<<___;
746 mov %rdi,$ctx # reassigned argument
747 mov %rsi,$inp # reassigned argument
748 mov %rdx,$num # reassigned argument
749 vzeroupper
751 shl \$6,$num
752 add $inp,$num
753 lea K_XX_XX(%rip),$K_XX_XX
755 mov 0($ctx),$A # load context
756 mov 4($ctx),$B
757 mov 8($ctx),$C
758 mov 12($ctx),$D
759 mov $B,@T[0] # magic seed
760 mov 16($ctx),$E
762 vmovdqa 64($K_XX_XX),@X[2] # pbswap mask
763 vmovdqa 0($K_XX_XX),@Tx[1] # K_00_19
764 vmovdqu 0($inp),@X[-4&7] # load input to %xmm[0-3]
765 vmovdqu 16($inp),@X[-3&7]
766 vmovdqu 32($inp),@X[-2&7]
767 vmovdqu 48($inp),@X[-1&7]
768 vpshufb @X[2],@X[-4&7],@X[-4&7] # byte swap
769 add \$64,$inp
770 vpshufb @X[2],@X[-3&7],@X[-3&7]
771 vpshufb @X[2],@X[-2&7],@X[-2&7]
772 vpshufb @X[2],@X[-1&7],@X[-1&7]
773 vpaddd @Tx[1],@X[-4&7],@X[0] # add K_00_19
774 vpaddd @Tx[1],@X[-3&7],@X[1]
775 vpaddd @Tx[1],@X[-2&7],@X[2]
776 vmovdqa @X[0],0(%rsp) # X[]+K xfer to IALU
777 vmovdqa @X[1],16(%rsp)
778 vmovdqa @X[2],32(%rsp)
779 jmp .Loop_avx
# Xupdate_avx_16_31($body): AVX counterpart of Xupdate_ssse3_16_31.  Emits
# four integer rounds interleaved with the three-operand AVX instructions
# that build one new message vector in @X[0].
# $body is a code ref returning a list of Perl snippets; it is called four
# times and the snippets are eval'ed one at a time between the SIMD
# instructions, so statement order below is the schedule.
# Along the way @X[-1]+K is stored to stack slot 16*(($Xi-1)&3) and the
# constant row 16*($Xi/5) is loaded from $K_XX_XX.  On exit $Xi is
# incremented and @X and @Tx are rotated by one position.
782 sub Xupdate_avx_16_31() # recall that $Xi starts with 4
783 { use integer;
784 my $body = shift;
785 my @insns = (&$body,&$body,&$body,&$body); # 40 instructions
786 my ($a,$b,$c,$d,$e);
788 eval(shift(@insns));
789 eval(shift(@insns));
790 &vpalignr(@X[0],@X[-3&7],@X[-4&7],8); # compose "X[-14]" in "X[0]"
791 eval(shift(@insns));
792 eval(shift(@insns));
794 &vpaddd (@Tx[1],@Tx[1],@X[-1&7]);
795 eval(shift(@insns));
796 eval(shift(@insns));
797 &vpsrldq(@Tx[0],@X[-1&7],4); # "X[-3]", 3 dwords
798 eval(shift(@insns));
799 eval(shift(@insns));
800 &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"^="X[-16]"
801 eval(shift(@insns));
802 eval(shift(@insns));
804 &vpxor (@Tx[0],@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]"
805 eval(shift(@insns));
806 eval(shift(@insns));
807 eval(shift(@insns));
808 eval(shift(@insns));
810 &vpxor (@X[0],@X[0],@Tx[0]); # "X[0]"^="X[-3]"^"X[-8]"
811 eval(shift(@insns));
812 eval(shift(@insns));
813 &vmovdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
814 eval(shift(@insns));
815 eval(shift(@insns));
817 &vpsrld (@Tx[0],@X[0],31);
818 eval(shift(@insns));
819 eval(shift(@insns));
820 eval(shift(@insns));
821 eval(shift(@insns));
823 &vpslldq(@Tx[2],@X[0],12); # "X[0]"<<96, extract one dword
824 &vpaddd (@X[0],@X[0],@X[0]);
825 eval(shift(@insns));
826 eval(shift(@insns));
827 eval(shift(@insns));
828 eval(shift(@insns));
830 &vpsrld (@Tx[1],@Tx[2],30);
831 &vpor (@X[0],@X[0],@Tx[0]); # "X[0]"<<<=1
832 eval(shift(@insns));
833 eval(shift(@insns));
834 eval(shift(@insns));
835 eval(shift(@insns));
837 &vpslld (@Tx[2],@Tx[2],2);
838 &vpxor (@X[0],@X[0],@Tx[1]);
839 eval(shift(@insns));
840 eval(shift(@insns));
841 eval(shift(@insns));
842 eval(shift(@insns));
844 &vpxor (@X[0],@X[0],@Tx[2]); # "X[0]"^=("X[0]">>96)<<<2
845 eval(shift(@insns));
846 eval(shift(@insns));
847 &vmovdqa (@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)"); # K_XX_XX
848 eval(shift(@insns));
849 eval(shift(@insns));
852 foreach (@insns) { eval; } # remaining instructions [if any]
854 $Xi++; push(@X,shift(@X)); # "rotate" X[]
855 push(@Tx,shift(@Tx));
# Xupdate_avx_32_79($body): AVX counterpart of Xupdate_ssse3_32_79.  Emits
# four integer rounds interleaved with the AVX instructions for one later
# message vector, using the form that xors earlier vectors into @X[0] and
# rotates left by 2 (vpslld 2 / vpsrld 30 / vpor).
# $body is a code ref returning a list of Perl snippets; it is called four
# times and the snippets are eval'ed between the SIMD instructions.  One
# extra snippet is consumed early unless the next one is a rotate.
# When $Xi is a multiple of 5 the next constant row is loaded from the
# table at $K_XX_XX, otherwise the current constant is copied forward.
# On exit $Xi is incremented and @X and @Tx are rotated by one position.
858 sub Xupdate_avx_32_79()
859 { use integer;
860 my $body = shift;
861 my @insns = (&$body,&$body,&$body,&$body); # 32 to 48 instructions
862 my ($a,$b,$c,$d,$e);
864 &vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8); # compose "X[-6]"
865 &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]"
866 eval(shift(@insns)); # body_20_39
867 eval(shift(@insns));
868 eval(shift(@insns));
869 eval(shift(@insns)); # rol
871 &vpxor (@X[0],@X[0],@X[-7&7]); # "X[0]"^="X[-28]"
872 eval(shift(@insns));
873 eval(shift(@insns)) if (@insns[0] !~ /&ro[rl]/);
874 if ($Xi%5) {
875 &vmovdqa (@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
876 } else { # ... or load next one
877 &vmovdqa (@Tx[2],eval(16*($Xi/5))."($K_XX_XX)");
879 &vpaddd (@Tx[1],@Tx[1],@X[-1&7]);
880 eval(shift(@insns)); # ror
881 eval(shift(@insns));
883 &vpxor (@X[0],@X[0],@Tx[0]); # "X[0]"^="X[-6]"
884 eval(shift(@insns)); # body_20_39
885 eval(shift(@insns));
886 eval(shift(@insns));
887 eval(shift(@insns)); # rol
889 &vpsrld (@Tx[0],@X[0],30);
890 &vmovdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
891 eval(shift(@insns));
892 eval(shift(@insns));
893 eval(shift(@insns)); # ror
894 eval(shift(@insns));
896 &vpslld (@X[0],@X[0],2);
897 eval(shift(@insns)); # body_20_39
898 eval(shift(@insns));
899 eval(shift(@insns));
900 eval(shift(@insns)); # rol
901 eval(shift(@insns));
902 eval(shift(@insns));
903 eval(shift(@insns)); # ror
904 eval(shift(@insns));
906 &vpor (@X[0],@X[0],@Tx[0]); # "X[0]"<<<=2
907 eval(shift(@insns)); # body_20_39
908 eval(shift(@insns));
909 &vmovdqa (@Tx[1],@X[0]) if ($Xi<19);
910 eval(shift(@insns));
911 eval(shift(@insns)); # rol
912 eval(shift(@insns));
913 eval(shift(@insns));
914 eval(shift(@insns)); # rol
915 eval(shift(@insns));
917 foreach (@insns) { eval; } # remaining instructions
919 $Xi++; push(@X,shift(@X)); # "rotate" X[]
920 push(@Tx,shift(@Tx));
# Xuplast_avx_80($body): emit four integer rounds together with the last
# X[]+K store of the current block, then the end-of-input test: compare
# $inp with $num and jump to .Ldone_avx when they are equal.
# On the fall-through path it starts on the next block: reloads the byte
# swap mask and K_00_19, loads 64 input bytes, byte-swaps the first vector
# and advances $inp by 64.  @Tx is rotated back by one and $Xi is reset to
# 0 for the Xloop_avx calls that follow.
#
# $body is a code ref returning a list of Perl snippets; it is called four
# times and the snippets are eval'ed in order around the SIMD instructions.
#
# The X[]+K store uses the VEX-encoded vmovdqa like every other instruction
# in the AVX path, rather than the legacy-SSE movdqa encoding.
sub Xuplast_avx_80()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	  &vpaddd	(@Tx[1],@Tx[1],@X[-1&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	  &vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer IALU

	 foreach (@insns) { eval; }		# remaining instructions

	&cmp	($inp,$num);
	&je	(".Ldone_avx");

	unshift(@Tx,pop(@Tx));

	&vmovdqa(@X[2],"64($K_XX_XX)");		# pbswap mask
	&vmovdqa(@Tx[1],"0($K_XX_XX)");		# K_00_19
	&vmovdqu(@X[-4&7],"0($inp)");		# load input
	&vmovdqu(@X[-3&7],"16($inp)");
	&vmovdqu(@X[-2&7],"32($inp)");
	&vmovdqu(@X[-1&7],"48($inp)");
	&vpshufb(@X[-4&7],@X[-4&7],@X[2]);	# byte swap
	&add	($inp,64);

  $Xi=0;
}
# Xloop_avx($body): emit four integer rounds of the current block while
# preparing the already-loaded next block.  For the current $Xi it
# byte-swaps vector ($Xi-3)&7 in place, writes vector ($Xi-4)&7 plus K
# (@Tx[1]) into @X[$Xi&7] and stores that sum at 16*$Xi(%rsp).  Because the
# sum goes to a separate register, no subtraction is needed afterwards.
# $Xi is incremented on exit; @X and @Tx are not rotated here.
# $body is a code ref returning a list of Perl snippets to eval.
957 sub Xloop_avx()
958 { use integer;
959 my $body = shift;
960 my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
961 my ($a,$b,$c,$d,$e);
963 eval(shift(@insns));
964 eval(shift(@insns));
965 &vpshufb(@X[($Xi-3)&7],@X[($Xi-3)&7],@X[2]);
966 eval(shift(@insns));
967 eval(shift(@insns));
968 &vpaddd (@X[$Xi&7],@X[($Xi-4)&7],@Tx[1]);
969 eval(shift(@insns));
970 eval(shift(@insns));
971 eval(shift(@insns));
972 eval(shift(@insns));
973 &vmovdqa(eval(16*$Xi)."(%rsp)",@X[$Xi&7]); # X[]+K xfer to IALU
974 eval(shift(@insns));
975 eval(shift(@insns));
977 foreach (@insns) { eval; }
978 $Xi++;
# Xtail_avx($body): emit four integer rounds with no SIMD work mixed in.
# $body is a code ref returning a list of Perl snippets; it is called four
# times and every returned snippet is eval'ed in order.
sub Xtail_avx()
{ use integer;
  my $body = shift;
  my ($a,$b,$c,$d,$e);		# targets of the eval'ed snippets; keep names

    # Collect all four rounds' snippets first, then run them one by one.
    for my $snippet (map { &$body } 1..4) {
	eval $snippet;
    }
}
# Body of sha1_block_data_order_avx.  .Loop_avx is assembled from 17
# generator calls of four rounds each.  Xuplast_avx_80 contains the
# end-of-input test and may jump to .Ldone_avx.  Because the last 12 rounds
# are generated twice (once overlapped with preparing the next block via
# Xloop_avx, once plain via Xtail_avx), $j and @V are saved before the
# first copy and restored before the second.
# Both copies finish by adding the working variables into the context at
# $ctx; the looping copy also re-seeds $B from @T[0] and jumps back.  The
# tail issues vzeroupper, then restores %xmm6-%xmm10 when $win64, and
# %r12/%rbp/%rbx and %rsp.
990 $code.=<<___;
991 .align 16
992 .Loop_avx:
994 &Xupdate_avx_16_31(\&body_00_19);
995 &Xupdate_avx_16_31(\&body_00_19);
996 &Xupdate_avx_16_31(\&body_00_19);
997 &Xupdate_avx_16_31(\&body_00_19);
998 &Xupdate_avx_32_79(\&body_00_19);
999 &Xupdate_avx_32_79(\&body_20_39);
1000 &Xupdate_avx_32_79(\&body_20_39);
1001 &Xupdate_avx_32_79(\&body_20_39);
1002 &Xupdate_avx_32_79(\&body_20_39);
1003 &Xupdate_avx_32_79(\&body_20_39);
1004 &Xupdate_avx_32_79(\&body_40_59);
1005 &Xupdate_avx_32_79(\&body_40_59);
1006 &Xupdate_avx_32_79(\&body_40_59);
1007 &Xupdate_avx_32_79(\&body_40_59);
1008 &Xupdate_avx_32_79(\&body_40_59);
1009 &Xupdate_avx_32_79(\&body_20_39);
1010 &Xuplast_avx_80(\&body_20_39); # can jump to "done"
1012 $saved_j=$j; @saved_V=@V;
1014 &Xloop_avx(\&body_20_39);
1015 &Xloop_avx(\&body_20_39);
1016 &Xloop_avx(\&body_20_39);
1018 $code.=<<___;
1019 add 0($ctx),$A # update context
1020 add 4($ctx),@T[0]
1021 add 8($ctx),$C
1022 add 12($ctx),$D
1023 mov $A,0($ctx)
1024 add 16($ctx),$E
1025 mov @T[0],4($ctx)
1026 mov @T[0],$B # magic seed
1027 mov $C,8($ctx)
1028 mov $D,12($ctx)
1029 mov $E,16($ctx)
1030 jmp .Loop_avx
1032 .align 16
1033 .Ldone_avx:
1035 $j=$saved_j; @V=@saved_V;
1037 &Xtail_avx(\&body_20_39);
1038 &Xtail_avx(\&body_20_39);
1039 &Xtail_avx(\&body_20_39);
1041 $code.=<<___;
1042 vzeroupper
1044 add 0($ctx),$A # update context
1045 add 4($ctx),@T[0]
1046 add 8($ctx),$C
1047 mov $A,0($ctx)
1048 add 12($ctx),$D
1049 mov @T[0],4($ctx)
1050 add 16($ctx),$E
1051 mov $C,8($ctx)
1052 mov $D,12($ctx)
1053 mov $E,16($ctx)
1055 $code.=<<___ if ($win64);
1056 movaps 64+0(%rsp),%xmm6
1057 movaps 64+16(%rsp),%xmm7
1058 movaps 64+32(%rsp),%xmm8
1059 movaps 64+48(%rsp),%xmm9
1060 movaps 64+64(%rsp),%xmm10
1062 $code.=<<___;
1063 lea `64+($win64?5*16:0)`(%rsp),%rsi
1064 mov 0(%rsi),%r12
1065 mov 8(%rsi),%rbp
1066 mov 16(%rsi),%rbx
1067 lea 24(%rsi),%rsp
1068 .Lepilogue_avx:
1070 .size sha1_block_data_order_avx,.-sha1_block_data_order_avx
# Constant table addressed through $K_XX_XX by the SSSE3 and AVX paths:
# four 16-byte rows of round constants at offsets 0, 16, 32 and 48, then
# the pshufb byte-swap mask at offset 64.  It is followed by an
# identification string.
1073 $code.=<<___;
1074 .align 64
1075 K_XX_XX:
1076 .long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 # K_00_19
1077 .long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 # K_20_39
1078 .long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc # K_40_59
1079 .long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 # K_60_79
1080 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap mask
1083 $code.=<<___;
1084 .asciz "SHA1 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
1085 .align 64
# Win64 only: structured-exception unwind handlers and the .pdata/.xdata
# tables that register them.
#  - se_handler serves the integer path.  When the faulting Rip lies
#    between .Lprologue and .Lepilogue it fetches the stack pointer saved
#    at 64(%rsp) and restores %rbx/%rbp/%r12/%r13 into the CONTEXT.
#  - ssse3_handler serves the SSSE3 and AVX paths.  It takes its prologue
#    and epilogue labels from HandlerData[], copies the saved
#    %xmm6-%xmm10 (10 quadwords) back into the CONTEXT and restores
#    %rbx/%rbp/%r12.
# Both end in .Lcommon_seh_tail, which restores Rsp/Rsi/Rdi in the CONTEXT,
# copies it to disp->ContextRecord, calls RtlVirtualUnwind and loads the
# ExceptionContinueSearch value (1) into %eax.
1088 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
1089 # CONTEXT *context,DISPATCHER_CONTEXT *disp)
1090 if ($win64) {
1091 $rec="%rcx";
1092 $frame="%rdx";
1093 $context="%r8";
1094 $disp="%r9";
1096 $code.=<<___;
1097 .extern __imp_RtlVirtualUnwind
1098 .type se_handler,\@abi-omnipotent
1099 .align 16
1100 se_handler:
1101 push %rsi
1102 push %rdi
1103 push %rbx
1104 push %rbp
1105 push %r12
1106 push %r13
1107 push %r14
1108 push %r15
1109 pushfq
1110 sub \$64,%rsp
1112 mov 120($context),%rax # pull context->Rax
1113 mov 248($context),%rbx # pull context->Rip
1115 lea .Lprologue(%rip),%r10
1116 cmp %r10,%rbx # context->Rip<.Lprologue
1117 jb .Lcommon_seh_tail
1119 mov 152($context),%rax # pull context->Rsp
1121 lea .Lepilogue(%rip),%r10
1122 cmp %r10,%rbx # context->Rip>=.Lepilogue
1123 jae .Lcommon_seh_tail
1125 mov `16*4`(%rax),%rax # pull saved stack pointer
1126 lea 32(%rax),%rax
1128 mov -8(%rax),%rbx
1129 mov -16(%rax),%rbp
1130 mov -24(%rax),%r12
1131 mov -32(%rax),%r13
1132 mov %rbx,144($context) # restore context->Rbx
1133 mov %rbp,160($context) # restore context->Rbp
1134 mov %r12,216($context) # restore context->R12
1135 mov %r13,224($context) # restore context->R13
1137 jmp .Lcommon_seh_tail
1138 .size se_handler,.-se_handler
1140 .type ssse3_handler,\@abi-omnipotent
1141 .align 16
1142 ssse3_handler:
1143 push %rsi
1144 push %rdi
1145 push %rbx
1146 push %rbp
1147 push %r12
1148 push %r13
1149 push %r14
1150 push %r15
1151 pushfq
1152 sub \$64,%rsp
1154 mov 120($context),%rax # pull context->Rax
1155 mov 248($context),%rbx # pull context->Rip
1157 mov 8($disp),%rsi # disp->ImageBase
1158 mov 56($disp),%r11 # disp->HandlerData
1160 mov 0(%r11),%r10d # HandlerData[0]
1161 lea (%rsi,%r10),%r10 # prologue label
1162 cmp %r10,%rbx # context->Rip<prologue label
1163 jb .Lcommon_seh_tail
1165 mov 152($context),%rax # pull context->Rsp
1167 mov 4(%r11),%r10d # HandlerData[1]
1168 lea (%rsi,%r10),%r10 # epilogue label
1169 cmp %r10,%rbx # context->Rip>=epilogue label
1170 jae .Lcommon_seh_tail
1172 lea 64(%rax),%rsi
1173 lea 512($context),%rdi # &context.Xmm6
1174 mov \$10,%ecx
1175 .long 0xa548f3fc # cld; rep movsq
1176 lea `24+64+5*16`(%rax),%rax # adjust stack pointer
1178 mov -8(%rax),%rbx
1179 mov -16(%rax),%rbp
1180 mov -24(%rax),%r12
1181 mov %rbx,144($context) # restore context->Rbx
1182 mov %rbp,160($context) # restore context->Rbp
1183 mov %r12,216($context) # restore cotnext->R12
1185 .Lcommon_seh_tail:
1186 mov 8(%rax),%rdi
1187 mov 16(%rax),%rsi
1188 mov %rax,152($context) # restore context->Rsp
1189 mov %rsi,168($context) # restore context->Rsi
1190 mov %rdi,176($context) # restore context->Rdi
1192 mov 40($disp),%rdi # disp->ContextRecord
1193 mov $context,%rsi # context
1194 mov \$154,%ecx # sizeof(CONTEXT)
1195 .long 0xa548f3fc # cld; rep movsq
1197 mov $disp,%rsi
1198 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
1199 mov 8(%rsi),%rdx # arg2, disp->ImageBase
1200 mov 0(%rsi),%r8 # arg3, disp->ControlPc
1201 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
1202 mov 40(%rsi),%r10 # disp->ContextRecord
1203 lea 56(%rsi),%r11 # &disp->HandlerData
1204 lea 24(%rsi),%r12 # &disp->EstablisherFrame
1205 mov %r10,32(%rsp) # arg5
1206 mov %r11,40(%rsp) # arg6
1207 mov %r12,48(%rsp) # arg7
1208 mov %rcx,56(%rsp) # arg8, (NULL)
1209 call *__imp_RtlVirtualUnwind(%rip)
1211 mov \$1,%eax # ExceptionContinueSearch
1212 add \$64,%rsp
1213 popfq
1214 pop %r15
1215 pop %r14
1216 pop %r13
1217 pop %r12
1218 pop %rbp
1219 pop %rbx
1220 pop %rdi
1221 pop %rsi
1223 .size ssse3_handler,.-ssse3_handler
1225 .section .pdata
1226 .align 4
1227 .rva .LSEH_begin_sha1_block_data_order
1228 .rva .LSEH_end_sha1_block_data_order
1229 .rva .LSEH_info_sha1_block_data_order
1230 .rva .LSEH_begin_sha1_block_data_order_ssse3
1231 .rva .LSEH_end_sha1_block_data_order_ssse3
1232 .rva .LSEH_info_sha1_block_data_order_ssse3
1234 $code.=<<___ if ($avx);
1235 .rva .LSEH_begin_sha1_block_data_order_avx
1236 .rva .LSEH_end_sha1_block_data_order_avx
1237 .rva .LSEH_info_sha1_block_data_order_avx
1239 $code.=<<___;
1240 .section .xdata
1241 .align 8
1242 .LSEH_info_sha1_block_data_order:
1243 .byte 9,0,0,0
1244 .rva se_handler
1245 .LSEH_info_sha1_block_data_order_ssse3:
1246 .byte 9,0,0,0
1247 .rva ssse3_handler
1248 .rva .Lprologue_ssse3,.Lepilogue_ssse3 # HandlerData[]
1250 $code.=<<___ if ($avx);
1251 .LSEH_info_sha1_block_data_order_avx:
1252 .byte 9,0,0,0
1253 .rva ssse3_handler
1254 .rva .Lprologue_avx,.Lepilogue_avx # HandlerData[]
####################################################################

# Replace every `...` span in the accumulated text with the result of
# evaluating it as Perl, then hand the text to the translator pipe.
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;

# STDOUT is the pipe to the translator: closing it flushes the output and
# waits for the child.  A false return means a write error ($! set) or a
# non-zero exit from the translator ($! is 0, status in $?), so report it
# instead of finishing silently with truncated or missing output.
close STDOUT
    or die "error closing STDOUT: "
	 . ($! ? $! : "translator exited with status $?");