# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# This is an AESNI-CBC+SHA256 "stitch" implementation. The idea, as
# spelled out in http://download.intel.com/design/intarch/papers/323686.pdf,
# is that since AESNI-CBC encryption exhibits *very* low instruction-level
# parallelism, interleaving it with another algorithm allows processor
# resources to be utilized better, achieving higher overall performance.
# The SHA256 instruction sequences(*) are taken from sha512-x86_64.pl and
# the AESNI code is woven into them. As SHA256 dominates execution time,
# stitch performance does not depend on AES key length. Below are
# performance numbers in cycles per processed byte, less is better, for
# standalone AESNI-CBC encrypt, standalone SHA256, and the stitched code:
#		 AES-128/-192/-256+SHA256	this(**)	gain
# Sandy Bridge	 5.05/6.05/7.05+11.6		13.0		+28%/36%/43%
# Ivy Bridge	 5.05/6.05/7.05+10.3		11.6		+32%/41%/50%
# Haswell	 4.43/5.29/6.19+7.80		8.79		+39%/49%/59%
# Bulldozer	 5.77/6.89/8.00+13.7		13.7		+42%/50%/58%
# (*)	there are XOP, AVX1 and AVX2 code paths, meaning that Westmere
#	is omitted from the loop, because the gain was not estimated
#	high enough to justify the effort;
# (**)	these are EVP-free results; results obtained with 'speed
#	-evp aes-256-cbc-hmac-sha256' will vary by a percent or two;
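#
# To read the table: for AES-128 on Sandy Bridge, running the two
# primitives back to back would cost about 5.05+11.6 = 16.65 cycles per
# byte, while the stitch measures 13.0, i.e. 16.65/13.0-1 = +28%, the
# first figure in the "gain" column.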
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";
if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.19) + ($1>=2.22);
}

if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.09) + ($1>=2.10);
}

if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	   `ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$avx = ($1>=10) + ($1>=12);
}

if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
	$avx = ($2>=3.0) + ($2>3.0);
}
$shaext=$avx;	### set to zero if compiling for 1.0.1
$avx=1		if (!$shaext && $avx);
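# At this point $avx is 0 (no AVX), 1 (take the XOP/AVX1 paths) or 2
# (the AVX2 path as well), while $shaext gates the SHA extension path.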
open OUT,"| \"$^X\" $xlate $flavour $output";
$func="aesni_cbc_sha256_enc";

@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
				"%r8d","%r9d","%r10d","%r11d");
($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%esi");
########################################################################
# void aesni_cbc_sha256_enc(const void *inp,

($inp, $out, $len, $key, $ivp, $ctx, $in0) =
    ("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
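# In the SysV calling convention the first six of these are the actual
# argument registers; %r10 is not, so the seventh argument is fetched
# from the stack explicitly in each code path ("load 7th parameter"
# below).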
$_inp="16*$SZ+0*8(%rsp)";
$_out="16*$SZ+1*8(%rsp)";
$_end="16*$SZ+2*8(%rsp)";
$_key="16*$SZ+3*8(%rsp)";
$_ivp="16*$SZ+4*8(%rsp)";
$_ctx="16*$SZ+5*8(%rsp)";
$_in0="16*$SZ+6*8(%rsp)";
$_rsp="16*$SZ+7*8(%rsp)";
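# Stack frame sketch (lowest address first): 16*$SZ bytes of X[] message
# schedule scratch, then eight saved qwords: inp, out, end, key, ivp,
# ctx, in0 and the caller's %rsp (plus, on Win64, an xmm6-xmm15 save
# area on top).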
.extern	OPENSSL_ia32cap_P
.type	$func,\@abi-omnipotent

	lea	OPENSSL_ia32cap_P(%rip),%r11
	cmp	\$0,`$win64?"%rcx":"%rdi"`
$code.=<<___ if ($shaext);
	bt	\$61,%r10			# check for SHA
___
	test	\$`1<<11`,%r10d			# check for XOP
$code.=<<___ if ($avx>1);
	and	\$`1<<8|1<<5|1<<3`,%r11d	# check for BMI2+AVX2+BMI1
	cmp	\$`1<<8|1<<5|1<<3`,%r11d
___
	and	\$`1<<28`,%r10d			# check for AVX
	cmp	\$0,`$win64?"%rcx":"%rdi"`
.type	$TABLE,\@object
.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2

.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
.long	0,0,0,0,   0,0,0,0,   -1,-1,-1,-1
.long	0,0,0,0,   0,0,0,0
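# The K256 rows are deliberately duplicated: the AVX2 path processes two
# blocks at once and loads each constant row into both 128-bit lanes of
# a %ymm register with a single 256-bit load. The 0/-1 rows at the end
# are the $mask10/$mask12/$mask14 blend masks, indexed by the AES round
# count, used by the branchless key-length handling described below.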
.asciz	"AESNI-CBC+SHA256 stitch for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
######################################################################
($iv,$inout,$roundkey,$temp,
 $mask10,$mask12,$mask14,$offload)=map("%xmm$_",(8..15));

my @aesni_cbc_block = (
##	&vmovdqu	($roundkey,"0x00-0x80($inp)");
##	&vmovdqu	($inout,($inp));
##	&mov		($_inp,$inp);
	'&vpxor		($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x10-0x80($inp)");',

	'&vpxor		($inout,$inout,$iv);',

	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x20-0x80($inp)");',

	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x30-0x80($inp)");',

	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x40-0x80($inp)");',

	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x50-0x80($inp)");',

	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x60-0x80($inp)");',

	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x70-0x80($inp)");',

	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x80-0x80($inp)");',

	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x90-0x80($inp)");',

	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0xa0-0x80($inp)");',

	'&vaesenclast	($temp,$inout,$roundkey);'.
	' &vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0xb0-0x80($inp)");',

	'&vpand		($iv,$temp,$mask10);'.
	' &vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0xc0-0x80($inp)");',

	'&vaesenclast	($temp,$inout,$roundkey);'.
	' &vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0xd0-0x80($inp)");',

	'&vpand		($temp,$temp,$mask12);'.
	' &vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0xe0-0x80($inp)");',

	'&vpor		($iv,$iv,$temp);'.
	' &vaesenclast	($temp,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x00-0x80($inp)");'
);
##	&mov		($inp,$_inp);
##	&mov		($out,$_out);
##	&vpand		($temp,$temp,$mask14);
##	&vpor		($iv,$iv,$temp);
##	&vmovdqu	($iv,"($out,$inp)");
##	&lea		($inp,"16($inp)");
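# Note the branchless handling of the AES key length above: speculative
# vaesenclast results are taken after 10, 12 and 14 rounds, each masked
# with $mask10/$mask12/$mask14 (exactly one of which is all-ones for the
# actual round count) and vpor-merged, so the correct ciphertext lands
# in $iv without a single branch.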
my ($a,$b,$c,$d,$e,$f,$g,$h);

sub AUTOLOAD()		# thunk [simplified] 32-bit style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
  my $arg = pop;
    $arg = "\$$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
}
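# e.g. &ror($a0,14) appends "\tror\t\$14,%r13d\n" to $code: the last
# argument is popped, a numeric argument gains the AT&T '$' prefix, and
# the operands are emitted in reverse (AT&T) order.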
	'($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
	'&ror	($a0,$Sigma1[2]-$Sigma1[1])',
	'&ror	($a1,$Sigma0[2]-$Sigma0[1])',
	'&xor	($a4,$g)',			# f^g
	'&ror	($a0,$Sigma1[1]-$Sigma1[0])',
	'&and	($a4,$e)',			# (f^g)&e
	@aesni_cbc_block[$aesni_cbc_idx++].
	'&add	($h,$SZ*($i&15)."(%rsp)")',	# h+=X[i]+K[i]
	'&ror	($a1,$Sigma0[1]-$Sigma0[0])',
	'&xor	($a4,$g)',			# Ch(e,f,g)=((f^g)&e)^g
	'&xor	($a2,$b)',			# a^b, b^c in next round
	'&ror	($a0,$Sigma1[0])',		# Sigma1(e)
	'&add	($h,$a4)',			# h+=Ch(e,f,g)
	'&and	($a3,$a2)',			# (b^c)&(a^b)
	'&add	($h,$a0)',			# h+=Sigma1(e)
	'&xor	($a3,$b)',			# Maj(a,b,c)=Ch(a^b,c,b)
	'&add	($d,$h)',			# d+=h
	'&ror	($a1,$Sigma0[0])',		# Sigma0(a)
	'&add	($h,$a3)',			# h+=Maj(a,b,c)

	'&add	($a1,$h);'.			# h+=Sigma0(a)
	'($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
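# This is one SHA256 round (FIPS 180-4): T1 = h + Sigma1(e) + Ch(e,f,g)
# + K[i] + W[i] (K[i]+W[i] arrives pre-added on the stack), T2 =
# Sigma0(a) + Maj(a,b,c), then d += T1 and h = T1 + T2; rotating @ROT
# replaces the a..h register shuffle, and one AES fragment from
# @aesni_cbc_block is interleaved per round.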
######################################################################
.type	${func}_xop,\@function,6

	mov	`($win64?56:8)`(%rsp),$in0	# load 7th parameter
	mov	%rsp,%r11			# copy %rsp
	sub	\$`$framesz+$win64*16*10`,%rsp
	and	\$-64,%rsp			# align stack frame
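	# (anding with -64 clears the six low bits, i.e. rounds %rsp
	# down to a 64-byte boundary)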
	sub	$inp,$out		# re-bias
	add	$inp,$len		# end of input
	#mov	$inp,$_inp		# saved later
	#mov	$key,$_key		# remains resident in $inp register
$code.=<<___ if ($win64);
	movaps	%xmm6,`$framesz+16*0`(%rsp)
	movaps	%xmm7,`$framesz+16*1`(%rsp)
	movaps	%xmm8,`$framesz+16*2`(%rsp)
	movaps	%xmm9,`$framesz+16*3`(%rsp)
	movaps	%xmm10,`$framesz+16*4`(%rsp)
	movaps	%xmm11,`$framesz+16*5`(%rsp)
	movaps	%xmm12,`$framesz+16*6`(%rsp)
	movaps	%xmm13,`$framesz+16*7`(%rsp)
	movaps	%xmm14,`$framesz+16*8`(%rsp)
	movaps	%xmm15,`$framesz+16*9`(%rsp)
___
	mov	$inp,%r12			# borrow $a4
	lea	0x80($key),$inp			# size optimization, reassign
	lea	$TABLE+`$SZ*2*$rounds+32`(%rip),%r13	# borrow $a0
	mov	0xf0-0x80($inp),%r14d		# rounds, borrow $a1
	mov	$ctx,%r15			# borrow $a2
	mov	$in0,%rsi			# borrow $a3
	vmovdqu	($ivp),$iv			# load IV

	vmovdqa	0x00(%r13,%r14,8),$mask14
	vmovdqa	0x10(%r13,%r14,8),$mask12
	vmovdqa	0x20(%r13,%r14,8),$mask10
	vmovdqu	0x00-0x80($inp),$roundkey
if ($SZ==4) {	# SHA256
    my @X = map("%xmm$_",(0..3));
    my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));

	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
	vmovdqu	0x00(%rsi,%r12),@X[0]
	vmovdqu	0x10(%rsi,%r12),@X[1]
	vmovdqu	0x20(%rsi,%r12),@X[2]
	vmovdqu	0x30(%rsi,%r12),@X[3]
	vpshufb	$t3,@X[0],@X[0]
	lea	$TABLE(%rip),$Tbl
	vpshufb	$t3,@X[1],@X[1]
	vpshufb	$t3,@X[2],@X[2]
	vpaddd	0x00($Tbl),@X[0],$t0
	vpshufb	$t3,@X[3],@X[3]
	vpaddd	0x20($Tbl),@X[1],$t1
	vpaddd	0x40($Tbl),@X[2],$t2
	vpaddd	0x60($Tbl),@X[3],$t3
	vmovdqa	$t0,0x00(%rsp)
	vmovdqa	$t1,0x10(%rsp)
	vmovdqa	$t2,0x20(%rsp)
	vmovdqa	$t3,0x30(%rsp)
	sub	\$-16*2*$SZ,$Tbl	# size optimization
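	# (subtracting -128 fits a sign-extended 8-bit immediate,
	# whereas adding +128 would need a 32-bit one - hence the
	# "size optimization")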
	vmovdqu	(%r12),$inout		# $a4
sub XOP_256_00_47 () {
my $j = shift;
my $body = shift;
my @X = @_;
my @insns = (&$body,&$body,&$body,&$body);	# 104 instructions

	&vpalignr	($t0,@X[1],@X[0],$SZ);	# X[1..4]
	&vpalignr	($t3,@X[3],@X[2],$SZ);	# X[9..12]
	&vprotd		($t1,$t0,8*$SZ-$sigma0[1]);
	&vpsrld		($t0,$t0,$sigma0[2]);
	&vpaddd		(@X[0],@X[0],$t3);	# X[0..3] += X[9..12]
	&vprotd		($t2,$t1,$sigma0[1]-$sigma0[0]);
	&vpxor		($t0,$t0,$t1);
	&vprotd		($t3,@X[3],8*$SZ-$sigma1[1]);
	&vpxor		($t0,$t0,$t2);		# sigma0(X[1..4])
	&vpsrld		($t2,@X[3],$sigma1[2]);
	&vpaddd		(@X[0],@X[0],$t0);	# X[0..3] += sigma0(X[1..4])
	&vprotd		($t1,$t3,$sigma1[1]-$sigma1[0]);
	&vpxor		($t3,$t3,$t2);
	&vpxor		($t3,$t3,$t1);		# sigma1(X[14..15])
	&vpsrldq	($t3,$t3,8);
	&vpaddd		(@X[0],@X[0],$t3);	# X[0..1] += sigma1(X[14..15])
	&vprotd		($t3,@X[0],8*$SZ-$sigma1[1]);
	&vpsrld		($t2,@X[0],$sigma1[2]);
	&vprotd		($t1,$t3,$sigma1[1]-$sigma1[0]);
	&vpxor		($t3,$t3,$t2);
	&vpxor		($t3,$t3,$t1);		# sigma1(X[16..17])
	&vpslldq	($t3,$t3,8);		# 22 instructions
	&vpaddd		(@X[0],@X[0],$t3);	# X[2..3] += sigma1(X[16..17])
	&vpaddd		($t2,@X[0],16*2*$j."($Tbl)");
	foreach (@insns) { eval; }		# remaining instructions
	&vmovdqa	(16*$j."(%rsp)",$t2);
}
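# XOP's vprotd rotates 32-bit lanes in one instruction, so each sigma
# rotation above costs a single op, unlike the vpslld/vpsrld/vpxor
# triples the plain AVX path below has to spend on the same rotates.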
    for ($i=0,$j=0; $j<4; $j++) {
	&XOP_256_00_47($j,\&body_00_15,@X);
	push(@X,shift(@X));			# rotate(@X)
    }
	&mov	("%r12",$_inp);			# borrow $a4
	&vpand	($temp,$temp,$mask14);
	&mov	("%r15",$_out);			# borrow $a2
	&vpor	($iv,$iv,$temp);
	&vmovdqu("(%r15,%r12)",$iv);		# write output
	&lea	("%r12","16(%r12)");		# inp++

	&cmpb	($SZ-1+16*2*$SZ."($Tbl)",0);
	&jne	(".Lxop_00_47");

	&vmovdqu($inout,"(%r12)");
    for ($i=0; $i<16; ) {
	foreach(body_00_15()) { eval; }
    }
	mov	$_inp,%r12		# borrow $a4
	mov	$_out,%r13		# borrow $a0
	mov	$_ctx,%r15		# borrow $a2
	mov	$_in0,%rsi		# borrow $a3

	vpand	$mask14,$temp,$temp
	vmovdqu	$iv,(%r13,%r12)		# write output
	lea	16(%r12),%r12		# inp++
	vmovdqu	$iv,($ivp)		# output IV
$code.=<<___ if ($win64);
	movaps	`$framesz+16*0`(%rsp),%xmm6
	movaps	`$framesz+16*1`(%rsp),%xmm7
	movaps	`$framesz+16*2`(%rsp),%xmm8
	movaps	`$framesz+16*3`(%rsp),%xmm9
	movaps	`$framesz+16*4`(%rsp),%xmm10
	movaps	`$framesz+16*5`(%rsp),%xmm11
	movaps	`$framesz+16*6`(%rsp),%xmm12
	movaps	`$framesz+16*7`(%rsp),%xmm13
	movaps	`$framesz+16*8`(%rsp),%xmm14
	movaps	`$framesz+16*9`(%rsp),%xmm15
___
.size	${func}_xop,.-${func}_xop
######################################################################
local *ror = sub { &shrd(@_[0],@_) };
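# (shrd with the same register as both source and destination behaves
# exactly like ror; rebinding &ror makes body_00_15 emit shrd on this
# code path)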
.type	${func}_avx,\@function,6
	mov	`($win64?56:8)`(%rsp),$in0	# load 7th parameter
	mov	%rsp,%r11			# copy %rsp
	sub	\$`$framesz+$win64*16*10`,%rsp
	and	\$-64,%rsp			# align stack frame
	sub	$inp,$out		# re-bias
	add	$inp,$len		# end of input
	#mov	$inp,$_inp		# saved later
	#mov	$key,$_key		# remains resident in $inp register
$code.=<<___ if ($win64);
	movaps	%xmm6,`$framesz+16*0`(%rsp)
	movaps	%xmm7,`$framesz+16*1`(%rsp)
	movaps	%xmm8,`$framesz+16*2`(%rsp)
	movaps	%xmm9,`$framesz+16*3`(%rsp)
	movaps	%xmm10,`$framesz+16*4`(%rsp)
	movaps	%xmm11,`$framesz+16*5`(%rsp)
	movaps	%xmm12,`$framesz+16*6`(%rsp)
	movaps	%xmm13,`$framesz+16*7`(%rsp)
	movaps	%xmm14,`$framesz+16*8`(%rsp)
	movaps	%xmm15,`$framesz+16*9`(%rsp)
___
	mov	$inp,%r12			# borrow $a4
	lea	0x80($key),$inp			# size optimization, reassign
	lea	$TABLE+`$SZ*2*$rounds+32`(%rip),%r13	# borrow $a0
	mov	0xf0-0x80($inp),%r14d		# rounds, borrow $a1
	mov	$ctx,%r15			# borrow $a2
	mov	$in0,%rsi			# borrow $a3
	vmovdqu	($ivp),$iv			# load IV

	vmovdqa	0x00(%r13,%r14,8),$mask14
	vmovdqa	0x10(%r13,%r14,8),$mask12
	vmovdqa	0x20(%r13,%r14,8),$mask10
	vmovdqu	0x00-0x80($inp),$roundkey
if ($SZ==4) {	# SHA256
    my @X = map("%xmm$_",(0..3));
    my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));

	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
	vmovdqu	0x00(%rsi,%r12),@X[0]
	vmovdqu	0x10(%rsi,%r12),@X[1]
	vmovdqu	0x20(%rsi,%r12),@X[2]
	vmovdqu	0x30(%rsi,%r12),@X[3]
	vpshufb	$t3,@X[0],@X[0]
	lea	$TABLE(%rip),$Tbl
	vpshufb	$t3,@X[1],@X[1]
	vpshufb	$t3,@X[2],@X[2]
	vpaddd	0x00($Tbl),@X[0],$t0
	vpshufb	$t3,@X[3],@X[3]
	vpaddd	0x20($Tbl),@X[1],$t1
	vpaddd	0x40($Tbl),@X[2],$t2
	vpaddd	0x60($Tbl),@X[3],$t3
	vmovdqa	$t0,0x00(%rsp)
	vmovdqa	$t1,0x10(%rsp)
	vmovdqa	$t2,0x20(%rsp)
	vmovdqa	$t3,0x30(%rsp)
	sub	\$-16*2*$SZ,$Tbl	# size optimization
	vmovdqu	(%r12),$inout		# $a4
sub Xupdate_256_AVX () {
	(
	'&vpalignr	($t0,@X[1],@X[0],$SZ)',	# X[1..4]
	'&vpalignr	($t3,@X[3],@X[2],$SZ)',	# X[9..12]
	'&vpsrld	($t2,$t0,$sigma0[0]);',
	'&vpaddd	(@X[0],@X[0],$t3)',	# X[0..3] += X[9..12]
	'&vpsrld	($t3,$t0,$sigma0[2])',
	'&vpslld	($t1,$t0,8*$SZ-$sigma0[1]);',
	'&vpxor		($t0,$t3,$t2)',
	'&vpshufd	($t3,@X[3],0b11111010)',# X[14..15]
	'&vpsrld	($t2,$t2,$sigma0[1]-$sigma0[0]);',
	'&vpxor		($t0,$t0,$t1)',
	'&vpslld	($t1,$t1,$sigma0[1]-$sigma0[0]);',
	'&vpxor		($t0,$t0,$t2)',
	'&vpsrld	($t2,$t3,$sigma1[2]);',
	'&vpxor		($t0,$t0,$t1)',		# sigma0(X[1..4])
	'&vpsrlq	($t3,$t3,$sigma1[0]);',
	'&vpaddd	(@X[0],@X[0],$t0)',	# X[0..3] += sigma0(X[1..4])
	'&vpxor		($t2,$t2,$t3);',
	'&vpsrlq	($t3,$t3,$sigma1[1]-$sigma1[0])',
	'&vpxor		($t2,$t2,$t3)',		# sigma1(X[14..15])
	'&vpshufd	($t2,$t2,0b10000100)',
	'&vpsrldq	($t2,$t2,8)',
	'&vpaddd	(@X[0],@X[0],$t2)',	# X[0..1] += sigma1(X[14..15])
	'&vpshufd	($t3,@X[0],0b01010000)',# X[16..17]
	'&vpsrld	($t2,$t3,$sigma1[2])',
	'&vpsrlq	($t3,$t3,$sigma1[0])',
	'&vpxor		($t2,$t2,$t3);',
	'&vpsrlq	($t3,$t3,$sigma1[1]-$sigma1[0])',
	'&vpxor		($t2,$t2,$t3)',
	'&vpshufd	($t2,$t2,0b11101000)',
	'&vpslldq	($t2,$t2,8)',
	'&vpaddd	(@X[0],@X[0],$t2)'	# X[2..3] += sigma1(X[16..17])
	);
}
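# For SHA256, sigma0(x) = ROTR7(x)^ROTR18(x)^SHR3(x) and sigma1(x) =
# ROTR17(x)^ROTR19(x)^SHR10(x); with @sigma0=(7,18,3) and
# @sigma1=(17,19,10) each ROTR above is composed from a left/right
# shift pair xored together, since AVX1 has no vector rotate.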
sub AVX_256_00_47 () {
my $j = shift;
my $body = shift;
my @X = @_;
my @insns = (&$body,&$body,&$body,&$body);	# 104 instructions

	foreach (Xupdate_256_AVX()) {		# 29 instructions
	    eval;
	    eval(shift(@insns));
	    eval(shift(@insns));
	    eval(shift(@insns));
	}
	&vpaddd	($t2,@X[0],16*2*$j."($Tbl)");
	foreach (@insns) { eval; }		# remaining instructions
	&vmovdqa	(16*$j."(%rsp)",$t2);
}
    for ($i=0,$j=0; $j<4; $j++) {
	&AVX_256_00_47($j,\&body_00_15,@X);
	push(@X,shift(@X));			# rotate(@X)
    }
	&mov	("%r12",$_inp);			# borrow $a4
	&vpand	($temp,$temp,$mask14);
	&mov	("%r15",$_out);			# borrow $a2
	&vpor	($iv,$iv,$temp);
	&vmovdqu("(%r15,%r12)",$iv);		# write output
	&lea	("%r12","16(%r12)");		# inp++

	&cmpb	($SZ-1+16*2*$SZ."($Tbl)",0);
	&jne	(".Lavx_00_47");

	&vmovdqu($inout,"(%r12)");
    for ($i=0; $i<16; ) {
	foreach(body_00_15()) { eval; }
    }
	mov	$_inp,%r12		# borrow $a4
	mov	$_out,%r13		# borrow $a0
	mov	$_ctx,%r15		# borrow $a2
	mov	$_in0,%rsi		# borrow $a3

	vpand	$mask14,$temp,$temp
	vmovdqu	$iv,(%r13,%r12)		# write output
	lea	16(%r12),%r12		# inp++
	vmovdqu	$iv,($ivp)		# output IV
$code.=<<___ if ($win64);
	movaps	`$framesz+16*0`(%rsp),%xmm6
	movaps	`$framesz+16*1`(%rsp),%xmm7
	movaps	`$framesz+16*2`(%rsp),%xmm8
	movaps	`$framesz+16*3`(%rsp),%xmm9
	movaps	`$framesz+16*4`(%rsp),%xmm10
	movaps	`$framesz+16*5`(%rsp),%xmm11
	movaps	`$framesz+16*6`(%rsp),%xmm12
	movaps	`$framesz+16*7`(%rsp),%xmm13
	movaps	`$framesz+16*8`(%rsp),%xmm14
	movaps	`$framesz+16*9`(%rsp),%xmm15
___
.size	${func}_avx,.-${func}_avx
######################################################################
my $a5=$SZ==4?"%esi":"%rsi";	# zap $inp

# at start $a1 should be zero, $a3 - $b^$c and $a4 copy of $f
	'($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
	'&add	($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)',	# h+=X[i]+K[i]
	'&and	($a4,$e)',		# f&e
	'&rorx	($a0,$e,$Sigma1[2])',
	'&rorx	($a2,$e,$Sigma1[1])',
	'&lea	($a,"($a,$a1)")',	# h+=Sigma0(a) from the past
	'&lea	($h,"($h,$a4)")',
	'&andn	($a4,$e,$g)',		# ~e&g
	'&rorx	($a1,$e,$Sigma1[0])',
	'&lea	($h,"($h,$a4)")',	# h+=Ch(e,f,g)=(e&f)+(~e&g)
	'&xor	($a0,$a1)',		# Sigma1(e)
	'&rorx	($a4,$a,$Sigma0[2])',
	'&lea	($h,"($h,$a0)")',	# h+=Sigma1(e)
	'&xor	($a2,$b)',		# a^b, b^c in next round
	'&rorx	($a1,$a,$Sigma0[1])',
	'&rorx	($a0,$a,$Sigma0[0])',
	'&lea	($d,"($d,$h)")',	# d+=h
	'&and	($a3,$a2)',		# (b^c)&(a^b)
	@aesni_cbc_block[$aesni_cbc_idx++].
	'&xor	($a3,$b)',		# Maj(a,b,c)=Ch(a^b,c,b)
	'&xor	($a1,$a0)',		# Sigma0(a)
	'&lea	($h,"($h,$a3)");'.	# h+=Maj(a,b,c)
	'&mov	($a4,$e)',		# copy of f in future

	'($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
# and at the finish one has to $a+=$a1
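# The lea-based Ch works because (e&f) and (~e&g) can never have a set
# bit in the same position, so their sum equals their xor; likewise
# Sigma0(a) is left in $a1 and only folded in at the start of the *next*
# round ("from the past"), which is why $a1 must finally be added to $a.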
.type	${func}_avx2,\@function,6

	mov	`($win64?56:8)`(%rsp),$in0	# load 7th parameter
	mov	%rsp,%r11			# copy %rsp
	sub	\$`2*$SZ*$rounds+8*8+$win64*16*10`,%rsp
	and	\$-256*$SZ,%rsp			# align stack frame
	add	\$`2*$SZ*($rounds-8)`,%rsp
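	# the AVX2 body keeps the K[]+W[] rows in a window that it
	# slides downward as it goes (lea -$PUSH8(%rsp) below), so the
	# frame is over-allocated and over-aligned here and walked back
	# at the end with lea `2*$SZ*($rounds-8)`(%rsp),%rsp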
	sub	$inp,$out		# re-bias
	add	$inp,$len		# end of input
	#mov	$inp,$_inp		# saved later
	#mov	$out,$_out		# kept in $offload
	#mov	$key,$_key		# remains resident in $inp register
$code.=<<___ if ($win64);
	movaps	%xmm6,`$framesz+16*0`(%rsp)
	movaps	%xmm7,`$framesz+16*1`(%rsp)
	movaps	%xmm8,`$framesz+16*2`(%rsp)
	movaps	%xmm9,`$framesz+16*3`(%rsp)
	movaps	%xmm10,`$framesz+16*4`(%rsp)
	movaps	%xmm11,`$framesz+16*5`(%rsp)
	movaps	%xmm12,`$framesz+16*6`(%rsp)
	movaps	%xmm13,`$framesz+16*7`(%rsp)
	movaps	%xmm14,`$framesz+16*8`(%rsp)
	movaps	%xmm15,`$framesz+16*9`(%rsp)
___
	mov	$inp,%r13			# borrow $a0
	vpinsrq	\$1,$out,$offload,$offload
	lea	0x80($key),$inp			# size optimization, reassign
	lea	$TABLE+`$SZ*2*$rounds+32`(%rip),%r12	# borrow $a4
	mov	0xf0-0x80($inp),%r14d		# rounds, borrow $a1
	mov	$ctx,%r15			# borrow $a2
	mov	$in0,%rsi			# borrow $a3
	vmovdqu	($ivp),$iv			# load IV

	vmovdqa	0x00(%r12,%r14,8),$mask14
	vmovdqa	0x10(%r12,%r14,8),$mask12
	vmovdqa	0x20(%r12,%r14,8),$mask10

	sub	\$-16*$SZ,%r13			# inp++, size optimization
	lea	(%rsi,%r13),%r12		# borrow $a0
	cmp	$len,%r13			# $_end
	cmove	%rsp,%r12			# next block or random data
	vmovdqu	0x00-0x80($inp),$roundkey
if ($SZ==4) {	# SHA256
    my @X = map("%ymm$_",(0..3));
    my ($t0,$t1,$t2,$t3) = map("%ymm$_",(4..7));

	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
	vmovdqu	-16*$SZ+0(%rsi,%r13),%xmm0
	vmovdqu	-16*$SZ+16(%rsi,%r13),%xmm1
	vmovdqu	-16*$SZ+32(%rsi,%r13),%xmm2
	vmovdqu	-16*$SZ+48(%rsi,%r13),%xmm3

	vinserti128	\$1,(%r12),@X[0],@X[0]
	vinserti128	\$1,16(%r12),@X[1],@X[1]
	vpshufb		$t3,@X[0],@X[0]
	vinserti128	\$1,32(%r12),@X[2],@X[2]
	vpshufb		$t3,@X[1],@X[1]
	vinserti128	\$1,48(%r12),@X[3],@X[3]
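	# the low 128-bit lanes hold the current block; vinserti128 \$1
	# pulls the following block (via %r12) into the high lanes, so
	# each pass hashes two blocks in parallel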
	lea	$TABLE(%rip),$Tbl
	vpshufb	$t3,@X[2],@X[2]
	lea	-16*$SZ(%r13),%r13
	vpaddd	0x00($Tbl),@X[0],$t0
	vpshufb	$t3,@X[3],@X[3]
	vpaddd	0x20($Tbl),@X[1],$t1
	vpaddd	0x40($Tbl),@X[2],$t2
	vpaddd	0x60($Tbl),@X[3],$t3
	vmovdqa	$t0,0x00(%rsp)
	vmovdqa	$t1,0x20(%rsp)
	lea	-$PUSH8(%rsp),%rsp
	vmovdqa	$t2,0x00(%rsp)
	vmovdqa	$t3,0x20(%rsp)
	sub	\$-16*2*$SZ,$Tbl	# size optimization

	vmovdqu	(%r13),$inout
	vpinsrq	\$0,%r13,$offload,$offload
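	# $offload (an %xmm register) doubles as spill space: lane 0
	# carries the input pointer and lane 1 the output pointer
	# (vpinsrq above), freeing both GPRs for the round function
	# until vmovq/vpextrq fetch them back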
sub AVX2_256_00_47 () {
my $j = shift;
my $body = shift;
my @X = @_;
my @insns = (&$body,&$body,&$body,&$body);	# 96 instructions
my $base = "+2*$PUSH8(%rsp)";

	&lea	("%rsp","-$PUSH8(%rsp)")	if (($j%2)==0);
	foreach (Xupdate_256_AVX()) {		# 29 instructions
	    eval;
	    eval(shift(@insns));
	    eval(shift(@insns));
	    eval(shift(@insns));
	}
	&vpaddd	($t2,@X[0],16*2*$j."($Tbl)");
	foreach (@insns) { eval; }		# remaining instructions
	&vmovdqa	((32*$j)%$PUSH8."(%rsp)",$t2);
}
    for ($i=0,$j=0; $j<4; $j++) {
	&AVX2_256_00_47($j,\&bodyx_00_15,@X);
	push(@X,shift(@X));			# rotate(@X)
    }
	&vmovq	("%r13",$offload);		# borrow $a0
	&vpextrq("%r15",$offload,1);		# borrow $a2
	&vpand	($temp,$temp,$mask14);
	&vpor	($iv,$iv,$temp);
	&vmovdqu("(%r15,%r13)",$iv);		# write output
	&lea	("%r13","16(%r13)");		# inp++

	&lea	($Tbl,16*2*$SZ."($Tbl)");
	&cmpb	(($SZ-1)."($Tbl)",0);
	&jne	(".Lavx2_00_47");

	&vmovdqu($inout,"(%r13)");
	&vpinsrq($offload,$offload,"%r13",0);
    for ($i=0; $i<16; ) {
	my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
	foreach(bodyx_00_15()) { eval; }
    }
	vpextrq	\$1,$offload,%r12		# $_out, borrow $a4
	vmovq	$offload,%r13			# $_inp, borrow $a0
	mov	`2*$SZ*$rounds+5*8`(%rsp),%r15	# $_ctx, borrow $a2
	lea	`2*$SZ*($rounds-8)`(%rsp),$Tbl

	vpand	$mask14,$temp,$temp
	vmovdqu	$iv,(%r12,%r13)			# write output
	cmp	`$PUSH8+2*8`($Tbl),%r13		# $_end
	vmovdqu	(%r13),$inout
	vpinsrq	\$0,%r13,$offload,$offload
    for ($i=0; $i<16; ) {
	my $base="+16($Tbl)";
	foreach(bodyx_00_15()) { eval; }
	&lea	($Tbl,"-$PUSH8($Tbl)")	if ($i==8);
    }
	vmovq	$offload,%r13			# borrow $a0
	vpextrq	\$1,$offload,%r15		# borrow $a2
	vpand	$mask14,$temp,$temp
	lea	-$PUSH8($Tbl),$Tbl
	vmovdqu	$iv,(%r15,%r13)			# write output
	lea	16(%r13),%r13			# inp++
	mov	`2*$SZ*$rounds+5*8`(%rsp),%r15	# $_ctx, borrow $a2
	lea	16*$SZ(%r13),%r13
	mov	`2*$SZ*$rounds+6*8`(%rsp),%rsi	# $_in0, borrow $a3
	lea	`2*$SZ*($rounds-8)`(%rsp),%rsp

	lea	(%rsi,%r13),%r12
	cmove	%rsp,%r12			# next block or stale data
	vmovdqu	$iv,($ivp)		# output IV
$code.=<<___ if ($win64);
	movaps	`$framesz+16*0`(%rsp),%xmm6
	movaps	`$framesz+16*1`(%rsp),%xmm7
	movaps	`$framesz+16*2`(%rsp),%xmm8
	movaps	`$framesz+16*3`(%rsp),%xmm9
	movaps	`$framesz+16*4`(%rsp),%xmm10
	movaps	`$framesz+16*5`(%rsp),%xmm11
	movaps	`$framesz+16*6`(%rsp),%xmm12
	movaps	`$framesz+16*7`(%rsp),%xmm13
	movaps	`$framesz+16*8`(%rsp),%xmm14
	movaps	`$framesz+16*9`(%rsp),%xmm15
___
.size	${func}_avx2,.-${func}_avx2
my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");

my ($rounds,$Tbl)=("%r11d","%rbx");

my ($iv,$in,$rndkey0)=map("%xmm$_",(6,14,15));
my @rndkey=("%xmm4","%xmm5");

my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..3,7..9));
my @MSG=map("%xmm$_",(10..13));
my ($n,$k)=($r/10,$r%10);

	movups		`16*$n`($in0),$in		# load input

$code.=<<___ if ($n);
	movups		$iv,`16*($n-1)`($out,$in0)	# write output
___

	movups		`32+16*$k-112`($key),$rndkey[1]
	aesenc		$rndkey[0],$iv

	movups		`32+16*($k+0)-112`($key),$rndkey[1]
	aesenc		$rndkey[0],$iv
	movups		`32+16*($k+1)-112`($key),$rndkey[0]
	aesenc		$rndkey[1],$iv

	movups		`32+16*($k+2)-112`($key),$rndkey[1]
	aesenc		$rndkey[0],$iv
	movups		`32+16*($k+3)-112`($key),$rndkey[0]
	aesenc		$rndkey[1],$iv

	aesenclast	$rndkey[0],$iv
	movups		16-112($key),$rndkey[1]		# forward reference

	movups		`32+16*$k-112`($key),$rndkey[1]
	aesenc		$rndkey[0],$iv

$r++;	unshift(@rndkey,pop(@rndkey));
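# Each call feeds exactly one step of AES-CBC into the SHA round
# stream: $r counts interleave slots, $n=$r/10 picks the data block and
# $k=$r%10 the slot within it, while @rndkey rotates so the next round
# key load is always issued one step ahead ("forward reference").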
.type	${func}_shaext,\@function,6

	mov	`($win64?56:8)`(%rsp),$inp	# load 7th argument
$code.=<<___ if ($win64);
	lea	`-8-10*16`(%rsp),%rsp
	movaps	%xmm6,-8-10*16(%rax)
	movaps	%xmm7,-8-9*16(%rax)
	movaps	%xmm8,-8-8*16(%rax)
	movaps	%xmm9,-8-7*16(%rax)
	movaps	%xmm10,-8-6*16(%rax)
	movaps	%xmm11,-8-5*16(%rax)
	movaps	%xmm12,-8-4*16(%rax)
	movaps	%xmm13,-8-3*16(%rax)
	movaps	%xmm14,-8-2*16(%rax)
	movaps	%xmm15,-8-1*16(%rax)
___
	lea	K256+0x80(%rip),$Tbl
	movdqu	($ctx),$ABEF		# DCBA
	movdqu	16($ctx),$CDGH		# HGFE
	movdqa	0x200-0x80($Tbl),$TMP	# byte swap mask

	mov	240($key),$rounds
	movups	($key),$rndkey0		# $key[0]
	movups	16($key),$rndkey[0]	# forward reference
	lea	112($key),$key		# size optimization

	pshufd	\$0x1b,$ABEF,$Wi	# ABCD
	pshufd	\$0xb1,$ABEF,$ABEF	# CDAB
	pshufd	\$0x1b,$CDGH,$CDGH	# EFGH
	movdqa	$TMP,$BSWAP		# offload
	palignr	\$8,$CDGH,$ABEF		# ABEF
	punpcklqdq	$Wi,$CDGH	# CDGH
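	# sha256rnds2 expects the state as ABEF/CDGH pairs, so the
	# pshufd/palignr/punpcklqdq sequence above rearranges the DCBA/
	# HGFE word order that SHA256_CTX stores in memory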
	movdqu	($inp),@MSG[0]
	movdqu	0x10($inp),@MSG[1]
	movdqu	0x20($inp),@MSG[2]
	movdqu	0x30($inp),@MSG[3]
	movdqa	0*32-0x80($Tbl),$Wi
	movdqa	$CDGH,$CDGH_SAVE	# offload
	movdqa	$ABEF,$ABEF_SAVE	# offload
	sha256rnds2	$ABEF,$CDGH	# 0-3
	pshufd		\$0x0e,$Wi,$Wi
	sha256rnds2	$CDGH,$ABEF

	movdqa	1*32-0x80($Tbl),$Wi
	sha256rnds2	$ABEF,$CDGH	# 4-7
	pshufd		\$0x0e,$Wi,$Wi
	sha256rnds2	$CDGH,$ABEF

	movdqa	2*32-0x80($Tbl),$Wi
	sha256msg1	@MSG[1],@MSG[0]
	sha256rnds2	$ABEF,$CDGH	# 8-11
	pshufd		\$0x0e,$Wi,$Wi
	palignr		\$4,@MSG[2],$TMP
	sha256rnds2	$CDGH,$ABEF

	movdqa	3*32-0x80($Tbl),$Wi
	sha256msg2	@MSG[3],@MSG[0]
	sha256msg1	@MSG[2],@MSG[1]
	sha256rnds2	$ABEF,$CDGH	# 12-15
	pshufd		\$0x0e,$Wi,$Wi
	palignr		\$4,@MSG[3],$TMP
	sha256rnds2	$CDGH,$ABEF
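	# each 32-byte K256 row carries four round constants (the paddd
	# that folds in the message words is elided in this excerpt);
	# sha256rnds2 consumes two rounds' worth from the implicit xmm0
	# ($Wi), and pshufd \$0x0e drops the upper pair into the low
	# half for the second sha256rnds2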
for($i=4;$i<16-3;$i++) {
  &$aesenc()		if (($r%10)==0);

	movdqa	$i*32-0x80($Tbl),$Wi
	sha256msg2	@MSG[0],@MSG[1]
	sha256msg1	@MSG[3],@MSG[2]
	sha256rnds2	$ABEF,$CDGH	# 16-19...
	pshufd		\$0x0e,$Wi,$Wi
	palignr		\$4,@MSG[0],$TMP

  &$aesenc()		if ($r==19);

	sha256rnds2	$CDGH,$ABEF

  push(@MSG,shift(@MSG));
}
	movdqa	13*32-0x80($Tbl),$Wi
	sha256msg2	@MSG[0],@MSG[1]
	sha256msg1	@MSG[3],@MSG[2]
	sha256rnds2	$ABEF,$CDGH	# 52-55
	pshufd		\$0x0e,$Wi,$Wi
	palignr		\$4,@MSG[0],$TMP
	sha256rnds2	$CDGH,$ABEF

	movdqa	14*32-0x80($Tbl),$Wi
	sha256msg2	@MSG[1],@MSG[2]
	sha256rnds2	$ABEF,$CDGH	# 56-59
	pshufd		\$0x0e,$Wi,$Wi
	sha256rnds2	$CDGH,$ABEF

	movdqa	15*32-0x80($Tbl),$Wi
	sha256rnds2	$ABEF,$CDGH	# 60-63
	pshufd		\$0x0e,$Wi,$Wi
	sha256rnds2	$CDGH,$ABEF
	#pxor	$CDGH,$rndkey0		# black magic
while ($r<40)	{ &$aesenc(); }		# remaining aesenc's
	#xorps	$CDGH,$rndkey0		# black magic
	paddd	$CDGH_SAVE,$CDGH
	paddd	$ABEF_SAVE,$ABEF

	movups	$iv,48($out,$in0)	# write output
	pshufd	\$0xb1,$CDGH,$CDGH	# DCHG
	pshufd	\$0x1b,$ABEF,$TMP	# FEBA
	pshufd	\$0xb1,$ABEF,$ABEF	# BAFE
	punpckhqdq	$CDGH,$ABEF	# DCBA
	palignr	\$8,$TMP,$CDGH		# HGFE

	movups	$iv,($ivp)		# write IV
	movdqu	$CDGH,16($ctx)
$code.=<<___ if ($win64);
	movaps	0*16(%rsp),%xmm6
	movaps	1*16(%rsp),%xmm7
	movaps	2*16(%rsp),%xmm8
	movaps	3*16(%rsp),%xmm9
	movaps	4*16(%rsp),%xmm10
	movaps	5*16(%rsp),%xmm11
	movaps	6*16(%rsp),%xmm12
	movaps	7*16(%rsp),%xmm13
	movaps	8*16(%rsp),%xmm14
	movaps	9*16(%rsp),%xmm15
	lea	8+10*16(%rsp),%rsp
___
.size	${func}_shaext,.-${func}_shaext
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64 && $avx) {
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
$code.=<<___ if ($shaext);
	lea	aesni_cbc_sha256_enc_shaext(%rip),%r10

	lea	512($context),%rdi	# &context.Xmm6
	.long	0xa548f3fc		# cld; rep movsq
	lea	168(%rax),%rax		# adjust stack pointer
$code.=<<___ if ($avx>1);
	lea	.Lavx2_shortcut(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<avx2_shortcut
	add	\$`2*$SZ*($rounds-8)`,%rax
	mov	%rax,%rsi		# put aside Rsp
	mov	16*$SZ+7*8(%rax),%rax	# pull $_rsp

	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

	lea	16*$SZ+8*8(%rsi),%rsi	# Xmm6- save area
	lea	512($context),%rdi	# &context.Xmm6
	.long	0xa548f3fc		# cld; rep movsq

	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi
	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)
	mov	\$1,%eax		# ExceptionContinueSearch
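	# (returning ExceptionContinueSearch: the handler only patches
	# the CONTEXT so the system unwinder can continue; it never
	# handles the exception itself)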
.size	se_handler,.-se_handler
	.rva	.LSEH_begin_${func}_xop
	.rva	.LSEH_end_${func}_xop
	.rva	.LSEH_info_${func}_xop

	.rva	.LSEH_begin_${func}_avx
	.rva	.LSEH_end_${func}_avx
	.rva	.LSEH_info_${func}_avx
$code.=<<___ if ($avx>1);
	.rva	.LSEH_begin_${func}_avx2
	.rva	.LSEH_end_${func}_avx2
	.rva	.LSEH_info_${func}_avx2
$code.=<<___ if ($shaext);
	.rva	.LSEH_begin_${func}_shaext
	.rva	.LSEH_end_${func}_shaext
	.rva	.LSEH_info_${func}_shaext
.LSEH_info_${func}_xop:
	.rva	.Lprologue_xop,.Lepilogue_xop		# HandlerData[]

.LSEH_info_${func}_avx:
	.rva	.Lprologue_avx,.Lepilogue_avx		# HandlerData[]
$code.=<<___ if ($avx>1);
.LSEH_info_${func}_avx2:
	.rva	.Lprologue_avx2,.Lepilogue_avx2		# HandlerData[]
$code.=<<___ if ($shaext);
.LSEH_info_${func}_shaext:
	.rva	.Lprologue_shaext,.Lepilogue_shaext	# HandlerData[]
####################################################################
sub rex {
  local *opcode=shift;
  my ($dst,$src)=@_;
  my $rex=0;

    $rex|=0x04			if($dst>=8);
    $rex|=0x01			if($src>=8);
    unshift @opcode,$rex|0x40	if($rex);
}
my %opcodelet = (
		"sha256rnds2" => 0xcb,
		"sha256msg1"  => 0xcc,
		"sha256msg2"  => 0xcd	);
sub sha256op38 {
  my $instr = shift;

  if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
    my @opcode=(0x0f,0x38);
	rex(\@opcode,$2,$1);
	push @opcode,$opcodelet{$instr};
	push @opcode,0xc0|($1&7)|(($2&7)<<3);	# ModR/M
	return ".byte\t".join(',',@opcode);
  } else {
	return $instr."\t".@_[0];
  }
}
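# e.g. sha256op38("sha256rnds2","%xmm0,%xmm2") returns
# ".byte\t0x0f,0x38,0xcb,0xd0" - no REX prefix needed for xmm0/xmm2,
# and ModR/M 0xd0 (mod=11, reg=010=xmm2, rm=000=xmm0) - a fallback for
# assemblers that predate the sha256* mnemonics.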
$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/gem;
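# The first substitution evaluates all `...` arithmetic embedded in the
# generated text; the second routes every sha256* mnemonic through
# sha256op38() so the output assembles even without SHA extension
# support in the assembler.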