release/src/router/openssl/crypto/aes/asm/aesni-sha256-x86_64.pl

   1 #!/usr/bin/env perl
   2 #
   3 # ====================================================================
   4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
   5 # project. The module is, however, dual licensed under OpenSSL and
   6 # CRYPTOGAMS licenses depending on where you obtain it. For further
   7 # details see http://www.openssl.org/~appro/cryptogams/.
   8 # ====================================================================
   9 #
  10 # January 2013
  11 #
  12 # This is AESNI-CBC+SHA256 stitch implementation. The idea, as spelled
  13 # in http://download.intel.com/design/intarch/papers/323686.pdf, is
  14 # that since AESNI-CBC encrypt exhibit *very* low instruction-level
  15 # parallelism, interleaving it with another algorithm would allow to
  16 # utilize processor resources better and achieve better performance.
  17 # SHA256 instruction sequences(*) are taken from sha512-x86_64.pl and
  18 # AESNI code is weaved into it. As SHA256 dominates execution time,
  19 # stitch performance does not depend on AES key length. Below are
  20 # performance numbers in cycles per processed byte, less is better,
  21 # for standalone AESNI-CBC encrypt, standalone SHA256, and stitched
  22 # subroutine:
  23 #
  24 #                AES-128/-192/-256+SHA256       this(**)gain
  25 # Sandy Bridge      5.05/6.05/7.05+11.6         13.0    +28%/36%/43%
  26 # Ivy Bridge        5.05/6.05/7.05+10.3         11.6    +32%/41%/50%
  27 # Haswell           4.43/5.29/6.19+7.80         8.79    +39%/49%/59%
  28 # Bulldozer         5.77/6.89/8.00+13.7         13.7    +42%/50%/58%
  29 #
  30 # (*)   there are XOP, AVX1 and AVX2 code pathes, meaning that
  31 #       Westmere is omitted from loop, this is because gain was not
  32 #       estimated high enough to justify the effort;
  33 # (**)  these are EVP-free results, results obtained with 'speed
  34 #       -evp aes-256-cbc-hmac-sha256' will vary by percent or two;
  35
  36 $flavour = shift;
  37 $output  = shift;
  38 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
  39
  40 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
  41
  42 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  43 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
  44 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
  45 die "can't locate x86_64-xlate.pl";
  46
  47 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
  48                 =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
  49         $avx = ($1>=2.19) + ($1>=2.22);
  50 }
  51
  52 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
  53            `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
  54         $avx = ($1>=2.09) + ($1>=2.10);
  55 }
  56
  57 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
  58            `ml64 2>&1` =~ /Version ([0-9]+)\./) {
  59         $avx = ($1>=10) + ($1>=12);
  60 }
  61
  62 if (!$avx && `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9]\.[0-9]+)/) {
  63         $avx = ($2>=3.0) + ($2>3.0);
  64 }
  65
  66 $shaext=$avx;   ### set to zero if compiling for 1.0.1
  67 $avx=1          if (!$shaext && $avx);
  68
  69 open OUT,"| \"$^X\" $xlate $flavour $output";
  70 *STDOUT=*OUT;
  71
  72 $func="aesni_cbc_sha256_enc";
  73 $TABLE="K256";
  74 $SZ=4;
  75 @ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
  76                                 "%r8d","%r9d","%r10d","%r11d");
  77 ($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%esi");
  78 @Sigma0=( 2,13,22);
  79 @Sigma1=( 6,11,25);
  80 @sigma0=( 7,18, 3);
  81 @sigma1=(17,19,10);
  82 $rounds=64;
  83
  84 ########################################################################
  85 # void aesni_cbc_sha256_enc(const void *inp,
  86 #                       void *out,
  87 #                       size_t length,
  88 #                       const AES_KEY *key,
  89 #                       unsigned char *iv,
  90 #                       SHA256_CTX *ctx,
  91 #                       const void *in0);
  92 ($inp,  $out,  $len,  $key,  $ivp, $ctx, $in0) =
  93 ("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
  94
  95 $Tbl="%rbp";
  96
  97 $_inp="16*$SZ+0*8(%rsp)";
  98 $_out="16*$SZ+1*8(%rsp)";
  99 $_end="16*$SZ+2*8(%rsp)";
 100 $_key="16*$SZ+3*8(%rsp)";
 101 $_ivp="16*$SZ+4*8(%rsp)";
 102 $_ctx="16*$SZ+5*8(%rsp)";
 103 $_in0="16*$SZ+6*8(%rsp)";
 104 $_rsp="16*$SZ+7*8(%rsp)";
 105 $framesz=16*$SZ+8*8;
 106
 107 $code=<<___;
 108 .text
 109
 110 .extern OPENSSL_ia32cap_P
 111 .globl  $func
 112 .type   $func,\@abi-omnipotent
 113 .align  16
 114 $func:
 115 ___
 116                                                 if ($avx) {
 117 $code.=<<___;
 118         lea     OPENSSL_ia32cap_P(%rip),%r11
 119         mov     \$1,%eax
 120         cmp     \$0,`$win64?"%rcx":"%rdi"`
 121         je      .Lprobe
 122         mov     0(%r11),%eax
 123         mov     4(%r11),%r10
 124 ___
 125 $code.=<<___ if ($shaext);
 126         bt      \$61,%r10                       # check for SHA
 127         jc      ${func}_shaext
 128 ___
 129 $code.=<<___;
 130         mov     %r10,%r11
 131         shr     \$32,%r11
 132
 133         test    \$`1<<11`,%r10d                 # check for XOP
 134         jnz     ${func}_xop
 135 ___
 136 $code.=<<___ if ($avx>1);
 137         and     \$`1<<8|1<<5|1<<3`,%r11d        # check for BMI2+AVX2+BMI1
 138         cmp     \$`1<<8|1<<5|1<<3`,%r11d
 139         je      ${func}_avx2
 140 ___
 141 $code.=<<___;
 142         and     \$`1<<30`,%eax                  # mask "Intel CPU" bit
 143         and     \$`1<<28|1<<9`,%r10d            # mask AVX+SSSE3 bits
 144         or      %eax,%r10d
 145         cmp     \$`1<<28|1<<9|1<<30`,%r10d
 146         je      ${func}_avx
 147         ud2
 148 ___
 149                                                 }
 150 $code.=<<___;
 151         xor     %eax,%eax
 152         cmp     \$0,`$win64?"%rcx":"%rdi"`
 153         je      .Lprobe
 154         ud2
 155 .Lprobe:
 156         ret
 157 .size   $func,.-$func
 158
 159 .align  64
 160 .type   $TABLE,\@object
 161 $TABLE:
 162         .long   0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
 163         .long   0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
 164         .long   0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
 165         .long   0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
 166         .long   0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
 167         .long   0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
 168         .long   0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
 169         .long   0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
 170         .long   0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
 171         .long   0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
 172         .long   0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
 173         .long   0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
 174         .long   0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
 175         .long   0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
 176         .long   0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
 177         .long   0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
 178         .long   0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
 179         .long   0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
 180         .long   0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
 181         .long   0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
 182         .long   0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
 183         .long   0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
 184         .long   0xd192e819,0xd6990624,0xf40e3585,0x106aa070
 185         .long   0xd192e819,0xd6990624,0xf40e3585,0x106aa070
 186         .long   0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
 187         .long   0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
 188         .long   0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
 189         .long   0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
 190         .long   0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
 191         .long   0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
 192         .long   0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
 193         .long   0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
 194
 195         .long   0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
 196         .long   0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
 197         .long   0,0,0,0,   0,0,0,0,   -1,-1,-1,-1
 198         .long   0,0,0,0,   0,0,0,0
 199         .asciz  "AESNI-CBC+SHA256 stitch for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
 200 .align  64
 201 ___
 202
 203 ######################################################################
 204 # SIMD code paths
 205 #
 206 {{{
 207 ($iv,$inout,$roundkey,$temp,
 208  $mask10,$mask12,$mask14,$offload)=map("%xmm$_",(8..15));
 209
 210 $aesni_cbc_idx=0;
 211 @aesni_cbc_block = (
 212 ##      &vmovdqu        ($roundkey,"0x00-0x80($inp)");'
 213 ##      &vmovdqu        ($inout,($inp));
 214 ##      &mov            ($_inp,$inp);
 215
 216         '&vpxor         ($inout,$inout,$roundkey);'.
 217         ' &vmovdqu      ($roundkey,"0x10-0x80($inp)");',
 218
 219         '&vpxor         ($inout,$inout,$iv);',
 220
 221         '&vaesenc       ($inout,$inout,$roundkey);'.
 222         ' &vmovdqu      ($roundkey,"0x20-0x80($inp)");',
 223
 224         '&vaesenc       ($inout,$inout,$roundkey);'.
 225         ' &vmovdqu      ($roundkey,"0x30-0x80($inp)");',
 226
 227         '&vaesenc       ($inout,$inout,$roundkey);'.
 228         ' &vmovdqu      ($roundkey,"0x40-0x80($inp)");',
 229
 230         '&vaesenc       ($inout,$inout,$roundkey);'.
 231         ' &vmovdqu      ($roundkey,"0x50-0x80($inp)");',
 232
 233         '&vaesenc       ($inout,$inout,$roundkey);'.
 234         ' &vmovdqu      ($roundkey,"0x60-0x80($inp)");',
 235
 236         '&vaesenc       ($inout,$inout,$roundkey);'.
 237         ' &vmovdqu      ($roundkey,"0x70-0x80($inp)");',
 238
 239         '&vaesenc       ($inout,$inout,$roundkey);'.
 240         ' &vmovdqu      ($roundkey,"0x80-0x80($inp)");',
 241
 242         '&vaesenc       ($inout,$inout,$roundkey);'.
 243         ' &vmovdqu      ($roundkey,"0x90-0x80($inp)");',
 244
 245         '&vaesenc       ($inout,$inout,$roundkey);'.
 246         ' &vmovdqu      ($roundkey,"0xa0-0x80($inp)");',
 247
 248         '&vaesenclast   ($temp,$inout,$roundkey);'.
 249         ' &vaesenc      ($inout,$inout,$roundkey);'.
 250         ' &vmovdqu      ($roundkey,"0xb0-0x80($inp)");',
 251
 252         '&vpand         ($iv,$temp,$mask10);'.
 253         ' &vaesenc      ($inout,$inout,$roundkey);'.
 254         ' &vmovdqu      ($roundkey,"0xc0-0x80($inp)");',
 255
 256         '&vaesenclast   ($temp,$inout,$roundkey);'.
 257         ' &vaesenc      ($inout,$inout,$roundkey);'.
 258         ' &vmovdqu      ($roundkey,"0xd0-0x80($inp)");',
 259
 260         '&vpand         ($temp,$temp,$mask12);'.
 261         ' &vaesenc      ($inout,$inout,$roundkey);'.
 262          '&vmovdqu      ($roundkey,"0xe0-0x80($inp)");',
 263
 264         '&vpor          ($iv,$iv,$temp);'.
 265         ' &vaesenclast  ($temp,$inout,$roundkey);'.
 266         ' &vmovdqu      ($roundkey,"0x00-0x80($inp)");'
 267
 268 ##      &mov            ($inp,$_inp);
 269 ##      &mov            ($out,$_out);
 270 ##      &vpand          ($temp,$temp,$mask14);
 271 ##      &vpor           ($iv,$iv,$temp);
 272 ##      &vmovdqu        ($iv,($out,$inp);
 273 ##      &lea            (inp,16($inp));
 274 );
 275
 276 my $a4=$T1;
 277 my ($a,$b,$c,$d,$e,$f,$g,$h);
 278
 279 sub AUTOLOAD()          # thunk [simplified] 32-bit style perlasm
 280 { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
 281   my $arg = pop;
 282     $arg = "\$$arg" if ($arg*1 eq $arg);
 283     $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
 284 }
 285
 286 sub body_00_15 () {
 287         (
 288         '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
 289
 290         '&ror   ($a0,$Sigma1[2]-$Sigma1[1])',
 291         '&mov   ($a,$a1)',
 292         '&mov   ($a4,$f)',
 293
 294         '&xor   ($a0,$e)',
 295         '&ror   ($a1,$Sigma0[2]-$Sigma0[1])',
 296         '&xor   ($a4,$g)',                      # f^g
 297
 298         '&ror   ($a0,$Sigma1[1]-$Sigma1[0])',
 299         '&xor   ($a1,$a)',
 300         '&and   ($a4,$e)',                      # (f^g)&e
 301
 302         @aesni_cbc_block[$aesni_cbc_idx++].
 303         '&xor   ($a0,$e)',
 304         '&add   ($h,$SZ*($i&15)."(%rsp)")',     # h+=X[i]+K[i]
 305         '&mov   ($a2,$a)',
 306
 307         '&ror   ($a1,$Sigma0[1]-$Sigma0[0])',
 308         '&xor   ($a4,$g)',                      # Ch(e,f,g)=((f^g)&e)^g
 309         '&xor   ($a2,$b)',                      # a^b, b^c in next round
 310
 311         '&ror   ($a0,$Sigma1[0])',              # Sigma1(e)
 312         '&add   ($h,$a4)',                      # h+=Ch(e,f,g)
 313         '&and   ($a3,$a2)',                     # (b^c)&(a^b)
 314
 315         '&xor   ($a1,$a)',
 316         '&add   ($h,$a0)',                      # h+=Sigma1(e)
 317         '&xor   ($a3,$b)',                      # Maj(a,b,c)=Ch(a^b,c,b)
 318
 319         '&add   ($d,$h)',                       # d+=h
 320         '&ror   ($a1,$Sigma0[0])',              # Sigma0(a)
 321         '&add   ($h,$a3)',                      # h+=Maj(a,b,c)
 322
 323         '&mov   ($a0,$d)',
 324         '&add   ($a1,$h);'.                     # h+=Sigma0(a)
 325         '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
 326         );
 327 }
 328
 329 if ($avx) {{
 330 ######################################################################
 331 # XOP code path
 332 #
 333 $code.=<<___;
 334 .type   ${func}_xop,\@function,6
 335 .align  64
 336 ${func}_xop:
 337 .Lxop_shortcut:
 338         mov     `($win64?56:8)`(%rsp),$in0      # load 7th parameter
 339         push    %rbx
 340         push    %rbp
 341         push    %r12
 342         push    %r13
 343         push    %r14
 344         push    %r15
 345         mov     %rsp,%r11               # copy %rsp
 346         sub     \$`$framesz+$win64*16*10`,%rsp
 347         and     \$-64,%rsp              # align stack frame
 348
 349         shl     \$6,$len
 350         sub     $inp,$out               # re-bias
 351         sub     $inp,$in0
 352         add     $inp,$len               # end of input
 353
 354         #mov    $inp,$_inp              # saved later
 355         mov     $out,$_out
 356         mov     $len,$_end
 357         #mov    $key,$_key              # remains resident in $inp register
 358         mov     $ivp,$_ivp
 359         mov     $ctx,$_ctx
 360         mov     $in0,$_in0
 361         mov     %r11,$_rsp
 362 ___
 363 $code.=<<___ if ($win64);
 364         movaps  %xmm6,`$framesz+16*0`(%rsp)
 365         movaps  %xmm7,`$framesz+16*1`(%rsp)
 366         movaps  %xmm8,`$framesz+16*2`(%rsp)
 367         movaps  %xmm9,`$framesz+16*3`(%rsp)
 368         movaps  %xmm10,`$framesz+16*4`(%rsp)
 369         movaps  %xmm11,`$framesz+16*5`(%rsp)
 370         movaps  %xmm12,`$framesz+16*6`(%rsp)
 371         movaps  %xmm13,`$framesz+16*7`(%rsp)
 372         movaps  %xmm14,`$framesz+16*8`(%rsp)
 373         movaps  %xmm15,`$framesz+16*9`(%rsp)
 374 ___
 375 $code.=<<___;
 376 .Lprologue_xop:
 377         vzeroall
 378
 379         mov     $inp,%r12               # borrow $a4
 380         lea     0x80($key),$inp         # size optimization, reassign
 381         lea     $TABLE+`$SZ*2*$rounds+32`(%rip),%r13    # borrow $a0
 382         mov     0xf0-0x80($inp),%r14d   # rounds, borrow $a1
 383         mov     $ctx,%r15               # borrow $a2
 384         mov     $in0,%rsi               # borrow $a3
 385         vmovdqu ($ivp),$iv              # load IV
 386         sub     \$9,%r14
 387
 388         mov     $SZ*0(%r15),$A
 389         mov     $SZ*1(%r15),$B
 390         mov     $SZ*2(%r15),$C
 391         mov     $SZ*3(%r15),$D
 392         mov     $SZ*4(%r15),$E
 393         mov     $SZ*5(%r15),$F
 394         mov     $SZ*6(%r15),$G
 395         mov     $SZ*7(%r15),$H
 396
 397         vmovdqa 0x00(%r13,%r14,8),$mask14
 398         vmovdqa 0x10(%r13,%r14,8),$mask12
 399         vmovdqa 0x20(%r13,%r14,8),$mask10
 400         vmovdqu 0x00-0x80($inp),$roundkey
 401         jmp     .Lloop_xop
 402 ___
 403                                         if ($SZ==4) {   # SHA256
 404     my @X = map("%xmm$_",(0..3));
 405     my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
 406
 407 $code.=<<___;
 408 .align  16
 409 .Lloop_xop:
 410         vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
 411         vmovdqu 0x00(%rsi,%r12),@X[0]
 412         vmovdqu 0x10(%rsi,%r12),@X[1]
 413         vmovdqu 0x20(%rsi,%r12),@X[2]
 414         vmovdqu 0x30(%rsi,%r12),@X[3]
 415         vpshufb $t3,@X[0],@X[0]
 416         lea     $TABLE(%rip),$Tbl
 417         vpshufb $t3,@X[1],@X[1]
 418         vpshufb $t3,@X[2],@X[2]
 419         vpaddd  0x00($Tbl),@X[0],$t0
 420         vpshufb $t3,@X[3],@X[3]
 421         vpaddd  0x20($Tbl),@X[1],$t1
 422         vpaddd  0x40($Tbl),@X[2],$t2
 423         vpaddd  0x60($Tbl),@X[3],$t3
 424         vmovdqa $t0,0x00(%rsp)
 425         mov     $A,$a1
 426         vmovdqa $t1,0x10(%rsp)
 427         mov     $B,$a3
 428         vmovdqa $t2,0x20(%rsp)
 429         xor     $C,$a3                  # magic
 430         vmovdqa $t3,0x30(%rsp)
 431         mov     $E,$a0
 432         jmp     .Lxop_00_47
 433
 434 .align  16
 435 .Lxop_00_47:
 436         sub     \$-16*2*$SZ,$Tbl        # size optimization
 437         vmovdqu (%r12),$inout           # $a4
 438         mov     %r12,$_inp              # $a4
 439 ___
 440 sub XOP_256_00_47 () {
 441 my $j = shift;
 442 my $body = shift;
 443 my @X = @_;
 444 my @insns = (&$body,&$body,&$body,&$body);      # 104 instructions
 445
 446         &vpalignr       ($t0,@X[1],@X[0],$SZ);  # X[1..4]
 447           eval(shift(@insns));
 448           eval(shift(@insns));
 449          &vpalignr      ($t3,@X[3],@X[2],$SZ);  # X[9..12]
 450           eval(shift(@insns));
 451           eval(shift(@insns));
 452         &vprotd         ($t1,$t0,8*$SZ-$sigma0[1]);
 453           eval(shift(@insns));
 454           eval(shift(@insns));
 455         &vpsrld         ($t0,$t0,$sigma0[2]);
 456           eval(shift(@insns));
 457           eval(shift(@insns));
 458          &vpaddd        (@X[0],@X[0],$t3);      # X[0..3] += X[9..12]
 459           eval(shift(@insns));
 460           eval(shift(@insns));
 461           eval(shift(@insns));
 462           eval(shift(@insns));
 463         &vprotd         ($t2,$t1,$sigma0[1]-$sigma0[0]);
 464           eval(shift(@insns));
 465           eval(shift(@insns));
 466         &vpxor          ($t0,$t0,$t1);
 467           eval(shift(@insns));
 468           eval(shift(@insns));
 469           eval(shift(@insns));
 470           eval(shift(@insns));
 471          &vprotd        ($t3,@X[3],8*$SZ-$sigma1[1]);
 472           eval(shift(@insns));
 473           eval(shift(@insns));
 474         &vpxor          ($t0,$t0,$t2);          # sigma0(X[1..4])
 475           eval(shift(@insns));
 476           eval(shift(@insns));
 477          &vpsrld        ($t2,@X[3],$sigma1[2]);
 478           eval(shift(@insns));
 479           eval(shift(@insns));
 480         &vpaddd         (@X[0],@X[0],$t0);      # X[0..3] += sigma0(X[1..4])
 481           eval(shift(@insns));
 482           eval(shift(@insns));
 483          &vprotd        ($t1,$t3,$sigma1[1]-$sigma1[0]);
 484           eval(shift(@insns));
 485           eval(shift(@insns));
 486          &vpxor         ($t3,$t3,$t2);
 487           eval(shift(@insns));
 488           eval(shift(@insns));
 489           eval(shift(@insns));
 490           eval(shift(@insns));
 491          &vpxor         ($t3,$t3,$t1);          # sigma1(X[14..15])
 492           eval(shift(@insns));
 493           eval(shift(@insns));
 494           eval(shift(@insns));
 495           eval(shift(@insns));
 496         &vpsrldq        ($t3,$t3,8);
 497           eval(shift(@insns));
 498           eval(shift(@insns));
 499           eval(shift(@insns));
 500           eval(shift(@insns));
 501         &vpaddd         (@X[0],@X[0],$t3);      # X[0..1] += sigma1(X[14..15])
 502           eval(shift(@insns));
 503           eval(shift(@insns));
 504           eval(shift(@insns));
 505           eval(shift(@insns));
 506          &vprotd        ($t3,@X[0],8*$SZ-$sigma1[1]);
 507           eval(shift(@insns));
 508           eval(shift(@insns));
 509          &vpsrld        ($t2,@X[0],$sigma1[2]);
 510           eval(shift(@insns));
 511           eval(shift(@insns));
 512          &vprotd        ($t1,$t3,$sigma1[1]-$sigma1[0]);
 513           eval(shift(@insns));
 514           eval(shift(@insns));
 515          &vpxor         ($t3,$t3,$t2);
 516           eval(shift(@insns));
 517           eval(shift(@insns));
 518           eval(shift(@insns));
 519           eval(shift(@insns));
 520          &vpxor         ($t3,$t3,$t1);          # sigma1(X[16..17])
 521           eval(shift(@insns));
 522           eval(shift(@insns));
 523           eval(shift(@insns));
 524           eval(shift(@insns));
 525         &vpslldq        ($t3,$t3,8);            # 22 instructions
 526           eval(shift(@insns));
 527           eval(shift(@insns));
 528           eval(shift(@insns));
 529           eval(shift(@insns));
 530         &vpaddd         (@X[0],@X[0],$t3);      # X[2..3] += sigma1(X[16..17])
 531           eval(shift(@insns));
 532           eval(shift(@insns));
 533           eval(shift(@insns));
 534           eval(shift(@insns));
 535         &vpaddd         ($t2,@X[0],16*2*$j."($Tbl)");
 536           foreach (@insns) { eval; }            # remaining instructions
 537         &vmovdqa        (16*$j."(%rsp)",$t2);
 538 }
 539
 540     $aesni_cbc_idx=0;
 541     for ($i=0,$j=0; $j<4; $j++) {
 542         &XOP_256_00_47($j,\&body_00_15,@X);
 543         push(@X,shift(@X));                     # rotate(@X)
 544     }
 545         &mov            ("%r12",$_inp);         # borrow $a4
 546         &vpand          ($temp,$temp,$mask14);
 547         &mov            ("%r15",$_out);         # borrow $a2
 548         &vpor           ($iv,$iv,$temp);
 549         &vmovdqu        ("(%r15,%r12)",$iv);    # write output
 550         &lea            ("%r12","16(%r12)");    # inp++
 551
 552         &cmpb   ($SZ-1+16*2*$SZ."($Tbl)",0);
 553         &jne    (".Lxop_00_47");
 554
 555         &vmovdqu        ($inout,"(%r12)");
 556         &mov            ($_inp,"%r12");
 557
 558     $aesni_cbc_idx=0;
 559     for ($i=0; $i<16; ) {
 560         foreach(body_00_15()) { eval; }
 561     }
 562                                         }
 563 $code.=<<___;
 564         mov     $_inp,%r12              # borrow $a4
 565         mov     $_out,%r13              # borrow $a0
 566         mov     $_ctx,%r15              # borrow $a2
 567         mov     $_in0,%rsi              # borrow $a3
 568
 569         vpand   $mask14,$temp,$temp
 570         mov     $a1,$A
 571         vpor    $temp,$iv,$iv
 572         vmovdqu $iv,(%r13,%r12)         # write output
 573         lea     16(%r12),%r12           # inp++
 574
 575         add     $SZ*0(%r15),$A
 576         add     $SZ*1(%r15),$B
 577         add     $SZ*2(%r15),$C
 578         add     $SZ*3(%r15),$D
 579         add     $SZ*4(%r15),$E
 580         add     $SZ*5(%r15),$F
 581         add     $SZ*6(%r15),$G
 582         add     $SZ*7(%r15),$H
 583
 584         cmp     $_end,%r12
 585
 586         mov     $A,$SZ*0(%r15)
 587         mov     $B,$SZ*1(%r15)
 588         mov     $C,$SZ*2(%r15)
 589         mov     $D,$SZ*3(%r15)
 590         mov     $E,$SZ*4(%r15)
 591         mov     $F,$SZ*5(%r15)
 592         mov     $G,$SZ*6(%r15)
 593         mov     $H,$SZ*7(%r15)
 594
 595         jb      .Lloop_xop
 596
 597         mov     $_ivp,$ivp
 598         mov     $_rsp,%rsi
 599         vmovdqu $iv,($ivp)              # output IV
 600         vzeroall
 601 ___
 602 $code.=<<___ if ($win64);
 603         movaps  `$framesz+16*0`(%rsp),%xmm6
 604         movaps  `$framesz+16*1`(%rsp),%xmm7
 605         movaps  `$framesz+16*2`(%rsp),%xmm8
 606         movaps  `$framesz+16*3`(%rsp),%xmm9
 607         movaps  `$framesz+16*4`(%rsp),%xmm10
 608         movaps  `$framesz+16*5`(%rsp),%xmm11
 609         movaps  `$framesz+16*6`(%rsp),%xmm12
 610         movaps  `$framesz+16*7`(%rsp),%xmm13
 611         movaps  `$framesz+16*8`(%rsp),%xmm14
 612         movaps  `$framesz+16*9`(%rsp),%xmm15
 613 ___
 614 $code.=<<___;
 615         mov     (%rsi),%r15
 616         mov     8(%rsi),%r14
 617         mov     16(%rsi),%r13
 618         mov     24(%rsi),%r12
 619         mov     32(%rsi),%rbp
 620         mov     40(%rsi),%rbx
 621         lea     48(%rsi),%rsp
 622 .Lepilogue_xop:
 623         ret
 624 .size   ${func}_xop,.-${func}_xop
 625 ___
 626 ######################################################################
 627 # AVX+shrd code path
 628 #
 629 local *ror = sub { &shrd(@_[0],@_) };
 630
 631 $code.=<<___;
 632 .type   ${func}_avx,\@function,6
 633 .align  64
 634 ${func}_avx:
 635 .Lavx_shortcut:
 636         mov     `($win64?56:8)`(%rsp),$in0      # load 7th parameter
 637         push    %rbx
 638         push    %rbp
 639         push    %r12
 640         push    %r13
 641         push    %r14
 642         push    %r15
 643         mov     %rsp,%r11               # copy %rsp
 644         sub     \$`$framesz+$win64*16*10`,%rsp
 645         and     \$-64,%rsp              # align stack frame
 646
 647         shl     \$6,$len
 648         sub     $inp,$out               # re-bias
 649         sub     $inp,$in0
 650         add     $inp,$len               # end of input
 651
 652         #mov    $inp,$_inp              # saved later
 653         mov     $out,$_out
 654         mov     $len,$_end
 655         #mov    $key,$_key              # remains resident in $inp register
 656         mov     $ivp,$_ivp
 657         mov     $ctx,$_ctx
 658         mov     $in0,$_in0
 659         mov     %r11,$_rsp
 660 ___
 661 $code.=<<___ if ($win64);
 662         movaps  %xmm6,`$framesz+16*0`(%rsp)
 663         movaps  %xmm7,`$framesz+16*1`(%rsp)
 664         movaps  %xmm8,`$framesz+16*2`(%rsp)
 665         movaps  %xmm9,`$framesz+16*3`(%rsp)
 666         movaps  %xmm10,`$framesz+16*4`(%rsp)
 667         movaps  %xmm11,`$framesz+16*5`(%rsp)
 668         movaps  %xmm12,`$framesz+16*6`(%rsp)
 669         movaps  %xmm13,`$framesz+16*7`(%rsp)
 670         movaps  %xmm14,`$framesz+16*8`(%rsp)
 671         movaps  %xmm15,`$framesz+16*9`(%rsp)
 672 ___
 673 $code.=<<___;
 674 .Lprologue_avx:
 675         vzeroall
 676
 677         mov     $inp,%r12               # borrow $a4
 678         lea     0x80($key),$inp         # size optimization, reassign
 679         lea     $TABLE+`$SZ*2*$rounds+32`(%rip),%r13    # borrow $a0
 680         mov     0xf0-0x80($inp),%r14d   # rounds, borrow $a1
 681         mov     $ctx,%r15               # borrow $a2
 682         mov     $in0,%rsi               # borrow $a3
 683         vmovdqu ($ivp),$iv              # load IV
 684         sub     \$9,%r14
 685
 686         mov     $SZ*0(%r15),$A
 687         mov     $SZ*1(%r15),$B
 688         mov     $SZ*2(%r15),$C
 689         mov     $SZ*3(%r15),$D
 690         mov     $SZ*4(%r15),$E
 691         mov     $SZ*5(%r15),$F
 692         mov     $SZ*6(%r15),$G
 693         mov     $SZ*7(%r15),$H
 694
 695         vmovdqa 0x00(%r13,%r14,8),$mask14
 696         vmovdqa 0x10(%r13,%r14,8),$mask12
 697         vmovdqa 0x20(%r13,%r14,8),$mask10
 698         vmovdqu 0x00-0x80($inp),$roundkey
 699 ___
 700                                         if ($SZ==4) {   # SHA256
 701     my @X = map("%xmm$_",(0..3));
 702     my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
 703
 704 $code.=<<___;
 705         jmp     .Lloop_avx
 706 .align  16
 707 .Lloop_avx:
 708         vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
 709         vmovdqu 0x00(%rsi,%r12),@X[0]
 710         vmovdqu 0x10(%rsi,%r12),@X[1]
 711         vmovdqu 0x20(%rsi,%r12),@X[2]
 712         vmovdqu 0x30(%rsi,%r12),@X[3]
 713         vpshufb $t3,@X[0],@X[0]
 714         lea     $TABLE(%rip),$Tbl
 715         vpshufb $t3,@X[1],@X[1]
 716         vpshufb $t3,@X[2],@X[2]
 717         vpaddd  0x00($Tbl),@X[0],$t0
 718         vpshufb $t3,@X[3],@X[3]
 719         vpaddd  0x20($Tbl),@X[1],$t1
 720         vpaddd  0x40($Tbl),@X[2],$t2
 721         vpaddd  0x60($Tbl),@X[3],$t3
 722         vmovdqa $t0,0x00(%rsp)
 723         mov     $A,$a1
 724         vmovdqa $t1,0x10(%rsp)
 725         mov     $B,$a3
 726         vmovdqa $t2,0x20(%rsp)
 727         xor     $C,$a3                  # magic
 728         vmovdqa $t3,0x30(%rsp)
 729         mov     $E,$a0
 730         jmp     .Lavx_00_47
 731
 732 .align  16
 733 .Lavx_00_47:
 734         sub     \$-16*2*$SZ,$Tbl        # size optimization
 735         vmovdqu (%r12),$inout           # $a4
 736         mov     %r12,$_inp              # $a4
 737 ___
 738 sub Xupdate_256_AVX () {
 739         (
 740         '&vpalignr      ($t0,@X[1],@X[0],$SZ)', # X[1..4]
 741          '&vpalignr     ($t3,@X[3],@X[2],$SZ)', # X[9..12]
 742         '&vpsrld        ($t2,$t0,$sigma0[0]);',
 743          '&vpaddd       (@X[0],@X[0],$t3)',     # X[0..3] += X[9..12]
 744         '&vpsrld        ($t3,$t0,$sigma0[2])',
 745         '&vpslld        ($t1,$t0,8*$SZ-$sigma0[1]);',
 746         '&vpxor         ($t0,$t3,$t2)',
 747          '&vpshufd      ($t3,@X[3],0b11111010)',# X[14..15]
 748         '&vpsrld        ($t2,$t2,$sigma0[1]-$sigma0[0]);',
 749         '&vpxor         ($t0,$t0,$t1)',
 750         '&vpslld        ($t1,$t1,$sigma0[1]-$sigma0[0]);',
 751         '&vpxor         ($t0,$t0,$t2)',
 752          '&vpsrld       ($t2,$t3,$sigma1[2]);',
 753         '&vpxor         ($t0,$t0,$t1)',         # sigma0(X[1..4])
 754          '&vpsrlq       ($t3,$t3,$sigma1[0]);',
 755         '&vpaddd        (@X[0],@X[0],$t0)',     # X[0..3] += sigma0(X[1..4])
 756          '&vpxor        ($t2,$t2,$t3);',
 757          '&vpsrlq       ($t3,$t3,$sigma1[1]-$sigma1[0])',
 758          '&vpxor        ($t2,$t2,$t3)',         # sigma1(X[14..15])
 759          '&vpshufd      ($t2,$t2,0b10000100)',
 760          '&vpsrldq      ($t2,$t2,8)',
 761         '&vpaddd        (@X[0],@X[0],$t2)',     # X[0..1] += sigma1(X[14..15])
 762          '&vpshufd      ($t3,@X[0],0b01010000)',# X[16..17]
 763          '&vpsrld       ($t2,$t3,$sigma1[2])',
 764          '&vpsrlq       ($t3,$t3,$sigma1[0])',
 765          '&vpxor        ($t2,$t2,$t3);',
 766          '&vpsrlq       ($t3,$t3,$sigma1[1]-$sigma1[0])',
 767          '&vpxor        ($t2,$t2,$t3)',
 768          '&vpshufd      ($t2,$t2,0b11101000)',
 769          '&vpslldq      ($t2,$t2,8)',
 770         '&vpaddd        (@X[0],@X[0],$t2)'      # X[2..3] += sigma1(X[16..17])
 771         );
 772 }
 773
 774 sub AVX_256_00_47 () {
 775 my $j = shift;
 776 my $body = shift;
 777 my @X = @_;
 778 my @insns = (&$body,&$body,&$body,&$body);      # 104 instructions
 779
 780         foreach (Xupdate_256_AVX()) {           # 29 instructions
 781             eval;
 782             eval(shift(@insns));
 783             eval(shift(@insns));
 784             eval(shift(@insns));
 785         }
 786         &vpaddd         ($t2,@X[0],16*2*$j."($Tbl)");
 787           foreach (@insns) { eval; }            # remaining instructions
 788         &vmovdqa        (16*$j."(%rsp)",$t2);
 789 }
 790
 791     $aesni_cbc_idx=0;
 792     for ($i=0,$j=0; $j<4; $j++) {
 793         &AVX_256_00_47($j,\&body_00_15,@X);
 794         push(@X,shift(@X));                     # rotate(@X)
 795     }
 796         &mov            ("%r12",$_inp);         # borrow $a4
 797         &vpand          ($temp,$temp,$mask14);
 798         &mov            ("%r15",$_out);         # borrow $a2
 799         &vpor           ($iv,$iv,$temp);
 800         &vmovdqu        ("(%r15,%r12)",$iv);    # write output
 801         &lea            ("%r12","16(%r12)");    # inp++
 802
 803         &cmpb   ($SZ-1+16*2*$SZ."($Tbl)",0);
 804         &jne    (".Lavx_00_47");
 805
 806         &vmovdqu        ($inout,"(%r12)");
 807         &mov            ($_inp,"%r12");
 808
 809     $aesni_cbc_idx=0;
 810     for ($i=0; $i<16; ) {
 811         foreach(body_00_15()) { eval; }
 812     }
 813
 814                                         }
 815 $code.=<<___;
 816         mov     $_inp,%r12              # borrow $a4
 817         mov     $_out,%r13              # borrow $a0
 818         mov     $_ctx,%r15              # borrow $a2
 819         mov     $_in0,%rsi              # borrow $a3
 820
 821         vpand   $mask14,$temp,$temp
 822         mov     $a1,$A
 823         vpor    $temp,$iv,$iv
 824         vmovdqu $iv,(%r13,%r12)         # write output
 825         lea     16(%r12),%r12           # inp++
 826
 827         add     $SZ*0(%r15),$A
 828         add     $SZ*1(%r15),$B
 829         add     $SZ*2(%r15),$C
 830         add     $SZ*3(%r15),$D
 831         add     $SZ*4(%r15),$E
 832         add     $SZ*5(%r15),$F
 833         add     $SZ*6(%r15),$G
 834         add     $SZ*7(%r15),$H
 835
 836         cmp     $_end,%r12
 837
 838         mov     $A,$SZ*0(%r15)
 839         mov     $B,$SZ*1(%r15)
 840         mov     $C,$SZ*2(%r15)
 841         mov     $D,$SZ*3(%r15)
 842         mov     $E,$SZ*4(%r15)
 843         mov     $F,$SZ*5(%r15)
 844         mov     $G,$SZ*6(%r15)
 845         mov     $H,$SZ*7(%r15)
 846         jb      .Lloop_avx
 847
 848         mov     $_ivp,$ivp
 849         mov     $_rsp,%rsi
 850         vmovdqu $iv,($ivp)              # output IV
 851         vzeroall
 852 ___
 853 $code.=<<___ if ($win64);
 854         movaps  `$framesz+16*0`(%rsp),%xmm6
 855         movaps  `$framesz+16*1`(%rsp),%xmm7
 856         movaps  `$framesz+16*2`(%rsp),%xmm8
 857         movaps  `$framesz+16*3`(%rsp),%xmm9
 858         movaps  `$framesz+16*4`(%rsp),%xmm10
 859         movaps  `$framesz+16*5`(%rsp),%xmm11
 860         movaps  `$framesz+16*6`(%rsp),%xmm12
 861         movaps  `$framesz+16*7`(%rsp),%xmm13
 862         movaps  `$framesz+16*8`(%rsp),%xmm14
 863         movaps  `$framesz+16*9`(%rsp),%xmm15
 864 ___
 865 $code.=<<___;
 866         mov     (%rsi),%r15
 867         mov     8(%rsi),%r14
 868         mov     16(%rsi),%r13
 869         mov     24(%rsi),%r12
 870         mov     32(%rsi),%rbp
 871         mov     40(%rsi),%rbx
 872         lea     48(%rsi),%rsp
 873 .Lepilogue_avx:
 874         ret
 875 .size   ${func}_avx,.-${func}_avx
 876 ___
 877
 878 if ($avx>1) {{
 879 ######################################################################
 880 # AVX2+BMI code path
 881 #
 882 my $a5=$SZ==4?"%esi":"%rsi";    # zap $inp
 883 my $PUSH8=8*2*$SZ;
 884 use integer;
 885
 886 sub bodyx_00_15 () {
 887         # at start $a1 should be zero, $a3 - $b^$c and $a4 copy of $f
 888         (
 889         '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
 890
 891         '&add   ($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)',    # h+=X[i]+K[i]
 892         '&and   ($a4,$e)',              # f&e
 893         '&rorx  ($a0,$e,$Sigma1[2])',
 894         '&rorx  ($a2,$e,$Sigma1[1])',
 895
 896         '&lea   ($a,"($a,$a1)")',       # h+=Sigma0(a) from the past
 897         '&lea   ($h,"($h,$a4)")',
 898         '&andn  ($a4,$e,$g)',           # ~e&g
 899         '&xor   ($a0,$a2)',
 900
 901         '&rorx  ($a1,$e,$Sigma1[0])',
 902         '&lea   ($h,"($h,$a4)")',       # h+=Ch(e,f,g)=(e&f)+(~e&g)
 903         '&xor   ($a0,$a1)',             # Sigma1(e)
 904         '&mov   ($a2,$a)',
 905
 906         '&rorx  ($a4,$a,$Sigma0[2])',
 907         '&lea   ($h,"($h,$a0)")',       # h+=Sigma1(e)
 908         '&xor   ($a2,$b)',              # a^b, b^c in next round
 909         '&rorx  ($a1,$a,$Sigma0[1])',
 910
 911         '&rorx  ($a0,$a,$Sigma0[0])',
 912         '&lea   ($d,"($d,$h)")',        # d+=h
 913         '&and   ($a3,$a2)',             # (b^c)&(a^b)
 914         @aesni_cbc_block[$aesni_cbc_idx++].
 915         '&xor   ($a1,$a4)',
 916
 917         '&xor   ($a3,$b)',              # Maj(a,b,c)=Ch(a^b,c,b)
 918         '&xor   ($a1,$a0)',             # Sigma0(a)
 919         '&lea   ($h,"($h,$a3)");'.      # h+=Maj(a,b,c)
 920         '&mov   ($a4,$e)',              # copy of f in future
 921
 922         '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
 923         );
 924         # and at the finish one has to $a+=$a1
 925 }
 926
 927 $code.=<<___;
 928 .type   ${func}_avx2,\@function,6
 929 .align  64
 930 ${func}_avx2:
 931 .Lavx2_shortcut:
 932         mov     `($win64?56:8)`(%rsp),$in0      # load 7th parameter
 933         push    %rbx
 934         push    %rbp
 935         push    %r12
 936         push    %r13
 937         push    %r14
 938         push    %r15
 939         mov     %rsp,%r11               # copy %rsp
 940         sub     \$`2*$SZ*$rounds+8*8+$win64*16*10`,%rsp
 941         and     \$-256*$SZ,%rsp         # align stack frame
 942         add     \$`2*$SZ*($rounds-8)`,%rsp
 943
 944         shl     \$6,$len
 945         sub     $inp,$out               # re-bias
 946         sub     $inp,$in0
 947         add     $inp,$len               # end of input
 948
 949         #mov    $inp,$_inp              # saved later
 950         #mov    $out,$_out              # kept in $offload
 951         mov     $len,$_end
 952         #mov    $key,$_key              # remains resident in $inp register
 953         mov     $ivp,$_ivp
 954         mov     $ctx,$_ctx
 955         mov     $in0,$_in0
 956         mov     %r11,$_rsp
 957 ___
 958 $code.=<<___ if ($win64);
 959         movaps  %xmm6,`$framesz+16*0`(%rsp)
 960         movaps  %xmm7,`$framesz+16*1`(%rsp)
 961         movaps  %xmm8,`$framesz+16*2`(%rsp)
 962         movaps  %xmm9,`$framesz+16*3`(%rsp)
 963         movaps  %xmm10,`$framesz+16*4`(%rsp)
 964         movaps  %xmm11,`$framesz+16*5`(%rsp)
 965         movaps  %xmm12,`$framesz+16*6`(%rsp)
 966         movaps  %xmm13,`$framesz+16*7`(%rsp)
 967         movaps  %xmm14,`$framesz+16*8`(%rsp)
 968         movaps  %xmm15,`$framesz+16*9`(%rsp)
 969 ___
 970 $code.=<<___;
 971 .Lprologue_avx2:
 972         vzeroall
 973
 974         mov     $inp,%r13               # borrow $a0
 975         vpinsrq \$1,$out,$offload,$offload
 976         lea     0x80($key),$inp         # size optimization, reassign
 977         lea     $TABLE+`$SZ*2*$rounds+32`(%rip),%r12    # borrow $a4
 978         mov     0xf0-0x80($inp),%r14d   # rounds, borrow $a1
 979         mov     $ctx,%r15               # borrow $a2
 980         mov     $in0,%rsi               # borrow $a3
 981         vmovdqu ($ivp),$iv              # load IV
 982         lea     -9(%r14),%r14
 983
 984         vmovdqa 0x00(%r12,%r14,8),$mask14
 985         vmovdqa 0x10(%r12,%r14,8),$mask12
 986         vmovdqa 0x20(%r12,%r14,8),$mask10
 987
 988         sub     \$-16*$SZ,%r13          # inp++, size optimization
 989         mov     $SZ*0(%r15),$A
 990         lea     (%rsi,%r13),%r12        # borrow $a0
 991         mov     $SZ*1(%r15),$B
 992         cmp     $len,%r13               # $_end
 993         mov     $SZ*2(%r15),$C
 994         cmove   %rsp,%r12               # next block or random data
 995         mov     $SZ*3(%r15),$D
 996         mov     $SZ*4(%r15),$E
 997         mov     $SZ*5(%r15),$F
 998         mov     $SZ*6(%r15),$G
 999         mov     $SZ*7(%r15),$H
1000         vmovdqu 0x00-0x80($inp),$roundkey
1001 ___
1002                                         if ($SZ==4) {   # SHA256
1003     my @X = map("%ymm$_",(0..3));
1004     my ($t0,$t1,$t2,$t3) = map("%ymm$_",(4..7));
1005
1006 $code.=<<___;
1007         jmp     .Loop_avx2
1008 .align  16
1009 .Loop_avx2:
1010         vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
1011         vmovdqu -16*$SZ+0(%rsi,%r13),%xmm0
1012         vmovdqu -16*$SZ+16(%rsi,%r13),%xmm1
1013         vmovdqu -16*$SZ+32(%rsi,%r13),%xmm2
1014         vmovdqu -16*$SZ+48(%rsi,%r13),%xmm3
1015
1016         vinserti128     \$1,(%r12),@X[0],@X[0]
1017         vinserti128     \$1,16(%r12),@X[1],@X[1]
1018          vpshufb        $t3,@X[0],@X[0]
1019         vinserti128     \$1,32(%r12),@X[2],@X[2]
1020          vpshufb        $t3,@X[1],@X[1]
1021         vinserti128     \$1,48(%r12),@X[3],@X[3]
1022
1023         lea     $TABLE(%rip),$Tbl
1024         vpshufb $t3,@X[2],@X[2]
1025         lea     -16*$SZ(%r13),%r13
1026         vpaddd  0x00($Tbl),@X[0],$t0
1027         vpshufb $t3,@X[3],@X[3]
1028         vpaddd  0x20($Tbl),@X[1],$t1
1029         vpaddd  0x40($Tbl),@X[2],$t2
1030         vpaddd  0x60($Tbl),@X[3],$t3
1031         vmovdqa $t0,0x00(%rsp)
1032         xor     $a1,$a1
1033         vmovdqa $t1,0x20(%rsp)
1034         lea     -$PUSH8(%rsp),%rsp
1035         mov     $B,$a3
1036         vmovdqa $t2,0x00(%rsp)
1037         xor     $C,$a3                  # magic
1038         vmovdqa $t3,0x20(%rsp)
1039         mov     $F,$a4
1040         sub     \$-16*2*$SZ,$Tbl        # size optimization
1041         jmp     .Lavx2_00_47
1042
1043 .align  16
1044 .Lavx2_00_47:
1045         vmovdqu (%r13),$inout
1046         vpinsrq \$0,%r13,$offload,$offload
1047 ___
1048
1049 sub AVX2_256_00_47 () {
1050 my $j = shift;
1051 my $body = shift;
1052 my @X = @_;
1053 my @insns = (&$body,&$body,&$body,&$body);      # 96 instructions
1054 my $base = "+2*$PUSH8(%rsp)";
1055
1056         &lea    ("%rsp","-$PUSH8(%rsp)")        if (($j%2)==0);
1057         foreach (Xupdate_256_AVX()) {           # 29 instructions
1058             eval;
1059             eval(shift(@insns));
1060             eval(shift(@insns));
1061             eval(shift(@insns));
1062         }
1063         &vpaddd         ($t2,@X[0],16*2*$j."($Tbl)");
1064           foreach (@insns) { eval; }            # remaining instructions
1065         &vmovdqa        ((32*$j)%$PUSH8."(%rsp)",$t2);
1066 }
1067     $aesni_cbc_idx=0;
1068     for ($i=0,$j=0; $j<4; $j++) {
1069         &AVX2_256_00_47($j,\&bodyx_00_15,@X);
1070         push(@X,shift(@X));                     # rotate(@X)
1071     }
1072         &vmovq          ("%r13",$offload);      # borrow $a0
1073         &vpextrq        ("%r15",$offload,1);    # borrow $a2
1074         &vpand          ($temp,$temp,$mask14);
1075         &vpor           ($iv,$iv,$temp);
1076         &vmovdqu        ("(%r15,%r13)",$iv);    # write output
1077         &lea            ("%r13","16(%r13)");    # inp++
1078
1079         &lea    ($Tbl,16*2*$SZ."($Tbl)");
1080         &cmpb   (($SZ-1)."($Tbl)",0);
1081         &jne    (".Lavx2_00_47");
1082
1083         &vmovdqu        ($inout,"(%r13)");
1084         &vpinsrq        ($offload,$offload,"%r13",0);
1085
1086     $aesni_cbc_idx=0;
1087     for ($i=0; $i<16; ) {
1088         my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
1089         foreach(bodyx_00_15()) { eval; }
1090     }
1091                                         }
1092 $code.=<<___;
1093         vpextrq \$1,$offload,%r12               # $_out, borrow $a4
1094         vmovq   $offload,%r13                   # $_inp, borrow $a0
1095         mov     `2*$SZ*$rounds+5*8`(%rsp),%r15  # $_ctx, borrow $a2
1096         add     $a1,$A
1097         lea     `2*$SZ*($rounds-8)`(%rsp),$Tbl
1098
1099         vpand   $mask14,$temp,$temp
1100         vpor    $temp,$iv,$iv
1101         vmovdqu $iv,(%r12,%r13)                 # write output
1102         lea     16(%r13),%r13
1103
1104         add     $SZ*0(%r15),$A
1105         add     $SZ*1(%r15),$B
1106         add     $SZ*2(%r15),$C
1107         add     $SZ*3(%r15),$D
1108         add     $SZ*4(%r15),$E
1109         add     $SZ*5(%r15),$F
1110         add     $SZ*6(%r15),$G
1111         add     $SZ*7(%r15),$H
1112
1113         mov     $A,$SZ*0(%r15)
1114         mov     $B,$SZ*1(%r15)
1115         mov     $C,$SZ*2(%r15)
1116         mov     $D,$SZ*3(%r15)
1117         mov     $E,$SZ*4(%r15)
1118         mov     $F,$SZ*5(%r15)
1119         mov     $G,$SZ*6(%r15)
1120         mov     $H,$SZ*7(%r15)
1121
1122         cmp     `$PUSH8+2*8`($Tbl),%r13         # $_end
1123         je      .Ldone_avx2
1124
1125         xor     $a1,$a1
1126         mov     $B,$a3
1127         mov     $F,$a4
1128         xor     $C,$a3                  # magic
1129         jmp     .Lower_avx2
1130 .align  16
1131 .Lower_avx2:
1132         vmovdqu (%r13),$inout
1133         vpinsrq \$0,%r13,$offload,$offload
1134 ___
1135     $aesni_cbc_idx=0;
1136     for ($i=0; $i<16; ) {
1137         my $base="+16($Tbl)";
1138         foreach(bodyx_00_15()) { eval; }
1139         &lea    ($Tbl,"-$PUSH8($Tbl)")  if ($i==8);
1140     }
1141 $code.=<<___;
1142         vmovq   $offload,%r13                   # borrow $a0
1143         vpextrq \$1,$offload,%r15               # borrow $a2
1144         vpand   $mask14,$temp,$temp
1145         vpor    $temp,$iv,$iv
1146         lea     -$PUSH8($Tbl),$Tbl
1147         vmovdqu $iv,(%r15,%r13)                 # write output
1148         lea     16(%r13),%r13                   # inp++
1149         cmp     %rsp,$Tbl
1150         jae     .Lower_avx2
1151
1152         mov     `2*$SZ*$rounds+5*8`(%rsp),%r15  # $_ctx, borrow $a2
1153         lea     16*$SZ(%r13),%r13
1154         mov     `2*$SZ*$rounds+6*8`(%rsp),%rsi  # $_in0, borrow $a3
1155         add     $a1,$A
1156         lea     `2*$SZ*($rounds-8)`(%rsp),%rsp
1157
1158         add     $SZ*0(%r15),$A
1159         add     $SZ*1(%r15),$B
1160         add     $SZ*2(%r15),$C
1161         add     $SZ*3(%r15),$D
1162         add     $SZ*4(%r15),$E
1163         add     $SZ*5(%r15),$F
1164         add     $SZ*6(%r15),$G
1165         lea     (%rsi,%r13),%r12
1166         add     $SZ*7(%r15),$H
1167
1168         cmp     $_end,%r13
1169
1170         mov     $A,$SZ*0(%r15)
1171         cmove   %rsp,%r12               # next block or stale data
1172         mov     $B,$SZ*1(%r15)
1173         mov     $C,$SZ*2(%r15)
1174         mov     $D,$SZ*3(%r15)
1175         mov     $E,$SZ*4(%r15)
1176         mov     $F,$SZ*5(%r15)
1177         mov     $G,$SZ*6(%r15)
1178         mov     $H,$SZ*7(%r15)
1179
1180         jbe     .Loop_avx2
1181         lea     (%rsp),$Tbl
1182
1183 .Ldone_avx2:
1184         lea     ($Tbl),%rsp
1185         mov     $_ivp,$ivp
1186         mov     $_rsp,%rsi
1187         vmovdqu $iv,($ivp)              # output IV
1188         vzeroall
1189 ___
1190 $code.=<<___ if ($win64);
1191         movaps  `$framesz+16*0`(%rsp),%xmm6
1192         movaps  `$framesz+16*1`(%rsp),%xmm7
1193         movaps  `$framesz+16*2`(%rsp),%xmm8
1194         movaps  `$framesz+16*3`(%rsp),%xmm9
1195         movaps  `$framesz+16*4`(%rsp),%xmm10
1196         movaps  `$framesz+16*5`(%rsp),%xmm11
1197         movaps  `$framesz+16*6`(%rsp),%xmm12
1198         movaps  `$framesz+16*7`(%rsp),%xmm13
1199         movaps  `$framesz+16*8`(%rsp),%xmm14
1200         movaps  `$framesz+16*9`(%rsp),%xmm15
1201 ___
1202 $code.=<<___;
1203         mov     (%rsi),%r15
1204         mov     8(%rsi),%r14
1205         mov     16(%rsi),%r13
1206         mov     24(%rsi),%r12
1207         mov     32(%rsi),%rbp
1208         mov     40(%rsi),%rbx
1209         lea     48(%rsi),%rsp
1210 .Lepilogue_avx2:
1211         ret
1212 .size   ${func}_avx2,.-${func}_avx2
1213 ___
1214 }}
1215 }}
1216 {{
1217 my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
1218
1219 my ($rounds,$Tbl)=("%r11d","%rbx");
1220
1221 my ($iv,$in,$rndkey0)=map("%xmm$_",(6,14,15));
1222 my @rndkey=("%xmm4","%xmm5");
1223 my $r=0;
1224 my $sn=0;
1225
1226 my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..3,7..9));
1227 my @MSG=map("%xmm$_",(10..13));
1228
1229 my $aesenc=sub {
1230   use integer;
1231   my ($n,$k)=($r/10,$r%10);
1232     if ($k==0) {
1233       $code.=<<___;
1234         movups          `16*$n`($in0),$in               # load input
1235         xorps           $rndkey0,$in
1236 ___
1237       $code.=<<___ if ($n);
1238         movups          $iv,`16*($n-1)`($out,$in0)      # write output
1239 ___
1240       $code.=<<___;
1241         xorps           $in,$iv
1242         movups          `32+16*$k-112`($key),$rndkey[1]
1243         aesenc          $rndkey[0],$iv
1244 ___
1245     } elsif ($k==9) {
1246       $sn++;
1247       $code.=<<___;
1248         cmp             \$11,$rounds
1249         jb              .Laesenclast$sn
1250         movups          `32+16*($k+0)-112`($key),$rndkey[1]
1251         aesenc          $rndkey[0],$iv
1252         movups          `32+16*($k+1)-112`($key),$rndkey[0]
1253         aesenc          $rndkey[1],$iv
1254         je              .Laesenclast$sn
1255         movups          `32+16*($k+2)-112`($key),$rndkey[1]
1256         aesenc          $rndkey[0],$iv
1257         movups          `32+16*($k+3)-112`($key),$rndkey[0]
1258         aesenc          $rndkey[1],$iv
1259 .Laesenclast$sn:
1260         aesenclast      $rndkey[0],$iv
1261         movups          16-112($key),$rndkey[1]         # forward reference
1262         nop
1263 ___
1264     } else {
1265       $code.=<<___;
1266         movups          `32+16*$k-112`($key),$rndkey[1]
1267         aesenc          $rndkey[0],$iv
1268 ___
1269     }
1270     $r++;       unshift(@rndkey,pop(@rndkey));
1271 };
1272
1273 if ($shaext) {
1274 my $Tbl="%rax";
1275
1276 $code.=<<___;
1277 .type   ${func}_shaext,\@function,6
1278 .align  32
1279 ${func}_shaext:
1280         mov     `($win64?56:8)`(%rsp),$inp      # load 7th argument
1281 ___
1282 $code.=<<___ if ($win64);
1283         lea     `-8-10*16`(%rsp),%rsp
1284         movaps  %xmm6,-8-10*16(%rax)
1285         movaps  %xmm7,-8-9*16(%rax)
1286         movaps  %xmm8,-8-8*16(%rax)
1287         movaps  %xmm9,-8-7*16(%rax)
1288         movaps  %xmm10,-8-6*16(%rax)
1289         movaps  %xmm11,-8-5*16(%rax)
1290         movaps  %xmm12,-8-4*16(%rax)
1291         movaps  %xmm13,-8-3*16(%rax)
1292         movaps  %xmm14,-8-2*16(%rax)
1293         movaps  %xmm15,-8-1*16(%rax)
1294 .Lprologue_shaext:
1295 ___
1296 $code.=<<___;
1297         lea             K256+0x80(%rip),$Tbl
1298         movdqu          ($ctx),$ABEF            # DCBA
1299         movdqu          16($ctx),$CDGH          # HGFE
1300         movdqa          0x200-0x80($Tbl),$TMP   # byte swap mask
1301
1302         mov             240($key),$rounds
1303         sub             $in0,$out
1304         movups          ($key),$rndkey0         # $key[0]
1305         movups          16($key),$rndkey[0]     # forward reference
1306         lea             112($key),$key          # size optimization
1307
1308         pshufd          \$0x1b,$ABEF,$Wi        # ABCD
1309         pshufd          \$0xb1,$ABEF,$ABEF      # CDAB
1310         pshufd          \$0x1b,$CDGH,$CDGH      # EFGH
1311         movdqa          $TMP,$BSWAP             # offload
1312         palignr         \$8,$CDGH,$ABEF         # ABEF
1313         punpcklqdq      $Wi,$CDGH               # CDGH
1314
1315         jmp     .Loop_shaext
1316
1317 .align  16
1318 .Loop_shaext:
1319         movdqu          ($inp),@MSG[0]
1320         movdqu          0x10($inp),@MSG[1]
1321         movdqu          0x20($inp),@MSG[2]
1322         pshufb          $TMP,@MSG[0]
1323         movdqu          0x30($inp),@MSG[3]
1324
1325         movdqa          0*32-0x80($Tbl),$Wi
1326         paddd           @MSG[0],$Wi
1327         pshufb          $TMP,@MSG[1]
1328         movdqa          $CDGH,$CDGH_SAVE        # offload
1329         movdqa          $ABEF,$ABEF_SAVE        # offload
1330 ___
1331         &$aesenc();
1332 $code.=<<___;
1333         sha256rnds2     $ABEF,$CDGH             # 0-3
1334         pshufd          \$0x0e,$Wi,$Wi
1335 ___
1336         &$aesenc();
1337 $code.=<<___;
1338         sha256rnds2     $CDGH,$ABEF
1339
1340         movdqa          1*32-0x80($Tbl),$Wi
1341         paddd           @MSG[1],$Wi
1342         pshufb          $TMP,@MSG[2]
1343         lea             0x40($inp),$inp
1344 ___
1345         &$aesenc();
1346 $code.=<<___;
1347         sha256rnds2     $ABEF,$CDGH             # 4-7
1348         pshufd          \$0x0e,$Wi,$Wi
1349 ___
1350         &$aesenc();
1351 $code.=<<___;
1352         sha256rnds2     $CDGH,$ABEF
1353
1354         movdqa          2*32-0x80($Tbl),$Wi
1355         paddd           @MSG[2],$Wi
1356         pshufb          $TMP,@MSG[3]
1357         sha256msg1      @MSG[1],@MSG[0]
1358 ___
1359         &$aesenc();
1360 $code.=<<___;
1361         sha256rnds2     $ABEF,$CDGH             # 8-11
1362         pshufd          \$0x0e,$Wi,$Wi
1363         movdqa          @MSG[3],$TMP
1364         palignr         \$4,@MSG[2],$TMP
1365         paddd           $TMP,@MSG[0]
1366 ___
1367         &$aesenc();
1368 $code.=<<___;
1369         sha256rnds2     $CDGH,$ABEF
1370
1371         movdqa          3*32-0x80($Tbl),$Wi
1372         paddd           @MSG[3],$Wi
1373         sha256msg2      @MSG[3],@MSG[0]
1374         sha256msg1      @MSG[2],@MSG[1]
1375 ___
1376         &$aesenc();
1377 $code.=<<___;
1378         sha256rnds2     $ABEF,$CDGH             # 12-15
1379         pshufd          \$0x0e,$Wi,$Wi
1380 ___
1381         &$aesenc();
1382 $code.=<<___;
1383         movdqa          @MSG[0],$TMP
1384         palignr         \$4,@MSG[3],$TMP
1385         paddd           $TMP,@MSG[1]
1386         sha256rnds2     $CDGH,$ABEF
1387 ___
1388 for($i=4;$i<16-3;$i++) {
1389         &$aesenc()      if (($r%10)==0);
1390 $code.=<<___;
1391         movdqa          $i*32-0x80($Tbl),$Wi
1392         paddd           @MSG[0],$Wi
1393         sha256msg2      @MSG[0],@MSG[1]
1394         sha256msg1      @MSG[3],@MSG[2]
1395 ___
1396         &$aesenc();
1397 $code.=<<___;
1398         sha256rnds2     $ABEF,$CDGH             # 16-19...
1399         pshufd          \$0x0e,$Wi,$Wi
1400         movdqa          @MSG[1],$TMP
1401         palignr         \$4,@MSG[0],$TMP
1402         paddd           $TMP,@MSG[2]
1403 ___
1404         &$aesenc();
1405         &$aesenc()      if ($r==19);
1406 $code.=<<___;
1407         sha256rnds2     $CDGH,$ABEF
1408 ___
1409         push(@MSG,shift(@MSG));
1410 }
1411 $code.=<<___;
1412         movdqa          13*32-0x80($Tbl),$Wi
1413         paddd           @MSG[0],$Wi
1414         sha256msg2      @MSG[0],@MSG[1]
1415         sha256msg1      @MSG[3],@MSG[2]
1416 ___
1417         &$aesenc();
1418 $code.=<<___;
1419         sha256rnds2     $ABEF,$CDGH             # 52-55
1420         pshufd          \$0x0e,$Wi,$Wi
1421         movdqa          @MSG[1],$TMP
1422         palignr         \$4,@MSG[0],$TMP
1423         paddd           $TMP,@MSG[2]
1424 ___
1425         &$aesenc();
1426         &$aesenc();
1427 $code.=<<___;
1428         sha256rnds2     $CDGH,$ABEF
1429
1430         movdqa          14*32-0x80($Tbl),$Wi
1431         paddd           @MSG[1],$Wi
1432         sha256msg2      @MSG[1],@MSG[2]
1433         movdqa          $BSWAP,$TMP
1434 ___
1435         &$aesenc();
1436 $code.=<<___;
1437         sha256rnds2     $ABEF,$CDGH             # 56-59
1438         pshufd          \$0x0e,$Wi,$Wi
1439 ___
1440         &$aesenc();
1441 $code.=<<___;
1442         sha256rnds2     $CDGH,$ABEF
1443
1444         movdqa          15*32-0x80($Tbl),$Wi
1445         paddd           @MSG[2],$Wi
1446 ___
1447         &$aesenc();
1448         &$aesenc();
1449 $code.=<<___;
1450         sha256rnds2     $ABEF,$CDGH             # 60-63
1451         pshufd          \$0x0e,$Wi,$Wi
1452 ___
1453         &$aesenc();
1454 $code.=<<___;
1455         sha256rnds2     $CDGH,$ABEF
1456         #pxor           $CDGH,$rndkey0          # black magic
1457 ___
1458         while ($r<40)   { &$aesenc(); }         # remaining aesenc's
1459 $code.=<<___;
1460         #xorps          $CDGH,$rndkey0          # black magic
1461         paddd           $CDGH_SAVE,$CDGH
1462         paddd           $ABEF_SAVE,$ABEF
1463
1464         dec             $len
1465         movups          $iv,48($out,$in0)       # write output
1466         lea             64($in0),$in0
1467         jnz             .Loop_shaext
1468
1469         pshufd          \$0xb1,$CDGH,$CDGH      # DCHG
1470         pshufd          \$0x1b,$ABEF,$TMP       # FEBA
1471         pshufd          \$0xb1,$ABEF,$ABEF      # BAFE
1472         punpckhqdq      $CDGH,$ABEF             # DCBA
1473         palignr         \$8,$TMP,$CDGH          # HGFE
1474
1475         movups          $iv,($ivp)              # write IV
1476         movdqu          $ABEF,($ctx)
1477         movdqu          $CDGH,16($ctx)
1478 ___
1479 $code.=<<___ if ($win64);
1480         movaps  0*16(%rsp),%xmm6
1481         movaps  1*16(%rsp),%xmm7
1482         movaps  2*16(%rsp),%xmm8
1483         movaps  3*16(%rsp),%xmm9
1484         movaps  4*16(%rsp),%xmm10
1485         movaps  5*16(%rsp),%xmm11
1486         movaps  6*16(%rsp),%xmm12
1487         movaps  7*16(%rsp),%xmm13
1488         movaps  8*16(%rsp),%xmm14
1489         movaps  9*16(%rsp),%xmm15
1490         lea     8+10*16(%rsp),%rsp
1491 .Lepilogue_shaext:
1492 ___
1493 $code.=<<___;
1494         ret
1495 .size   ${func}_shaext,.-${func}_shaext
1496 ___
1497 }
1498 }}}}}
1499
1500 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
1501 #               CONTEXT *context,DISPATCHER_CONTEXT *disp)
1502 if ($win64) {
1503 $rec="%rcx";
1504 $frame="%rdx";
1505 $context="%r8";
1506 $disp="%r9";
1507
1508 $code.=<<___ if ($avx);
1509 .extern __imp_RtlVirtualUnwind
1510 .type   se_handler,\@abi-omnipotent
1511 .align  16
1512 se_handler:
1513         push    %rsi
1514         push    %rdi
1515         push    %rbx
1516         push    %rbp
1517         push    %r12
1518         push    %r13
1519         push    %r14
1520         push    %r15
1521         pushfq
1522         sub     \$64,%rsp
1523
1524         mov     120($context),%rax      # pull context->Rax
1525         mov     248($context),%rbx      # pull context->Rip
1526
1527         mov     8($disp),%rsi           # disp->ImageBase
1528         mov     56($disp),%r11          # disp->HanderlData
1529
1530         mov     0(%r11),%r10d           # HandlerData[0]
1531         lea     (%rsi,%r10),%r10        # prologue label
1532         cmp     %r10,%rbx               # context->Rip<prologue label
1533         jb      .Lin_prologue
1534
1535         mov     152($context),%rax      # pull context->Rsp
1536
1537         mov     4(%r11),%r10d           # HandlerData[1]
1538         lea     (%rsi,%r10),%r10        # epilogue label
1539         cmp     %r10,%rbx               # context->Rip>=epilogue label
1540         jae     .Lin_prologue
1541 ___
1542 $code.=<<___ if ($shaext);
1543         lea     aesni_cbc_sha256_enc_shaext(%rip),%r10
1544         cmp     %r10,%rbx
1545         jb      .Lnot_in_shaext
1546
1547         lea     (%rax),%rsi
1548         lea     512($context),%rdi      # &context.Xmm6
1549         mov     \$20,%ecx
1550         .long   0xa548f3fc              # cld; rep movsq
1551         lea     168(%rax),%rax          # adjust stack pointer
1552         jmp     .Lin_prologue
1553 .Lnot_in_shaext:
1554 ___
1555 $code.=<<___ if ($avx>1);
1556         lea     .Lavx2_shortcut(%rip),%r10
1557         cmp     %r10,%rbx               # context->Rip<avx2_shortcut
1558         jb      .Lnot_in_avx2
1559
1560         and     \$-256*$SZ,%rax
1561         add     \$`2*$SZ*($rounds-8)`,%rax
1562 .Lnot_in_avx2:
1563 ___
1564 $code.=<<___;
1565         mov     %rax,%rsi               # put aside Rsp
1566         mov     16*$SZ+7*8(%rax),%rax   # pull $_rsp
1567         lea     48(%rax),%rax
1568
1569         mov     -8(%rax),%rbx
1570         mov     -16(%rax),%rbp
1571         mov     -24(%rax),%r12
1572         mov     -32(%rax),%r13
1573         mov     -40(%rax),%r14
1574         mov     -48(%rax),%r15
1575         mov     %rbx,144($context)      # restore context->Rbx
1576         mov     %rbp,160($context)      # restore context->Rbp
1577         mov     %r12,216($context)      # restore context->R12
1578         mov     %r13,224($context)      # restore context->R13
1579         mov     %r14,232($context)      # restore context->R14
1580         mov     %r15,240($context)      # restore context->R15
1581
1582         lea     16*$SZ+8*8(%rsi),%rsi   # Xmm6- save area
1583         lea     512($context),%rdi      # &context.Xmm6
1584         mov     \$20,%ecx
1585         .long   0xa548f3fc              # cld; rep movsq
1586
1587 .Lin_prologue:
1588         mov     8(%rax),%rdi
1589         mov     16(%rax),%rsi
1590         mov     %rax,152($context)      # restore context->Rsp
1591         mov     %rsi,168($context)      # restore context->Rsi
1592         mov     %rdi,176($context)      # restore context->Rdi
1593
1594         mov     40($disp),%rdi          # disp->ContextRecord
1595         mov     $context,%rsi           # context
1596         mov     \$154,%ecx              # sizeof(CONTEXT)
1597         .long   0xa548f3fc              # cld; rep movsq
1598
1599         mov     $disp,%rsi
1600         xor     %rcx,%rcx               # arg1, UNW_FLAG_NHANDLER
1601         mov     8(%rsi),%rdx            # arg2, disp->ImageBase
1602         mov     0(%rsi),%r8             # arg3, disp->ControlPc
1603         mov     16(%rsi),%r9            # arg4, disp->FunctionEntry
1604         mov     40(%rsi),%r10           # disp->ContextRecord
1605         lea     56(%rsi),%r11           # &disp->HandlerData
1606         lea     24(%rsi),%r12           # &disp->EstablisherFrame
1607         mov     %r10,32(%rsp)           # arg5
1608         mov     %r11,40(%rsp)           # arg6
1609         mov     %r12,48(%rsp)           # arg7
1610         mov     %rcx,56(%rsp)           # arg8, (NULL)
1611         call    *__imp_RtlVirtualUnwind(%rip)
1612
1613         mov     \$1,%eax                # ExceptionContinueSearch
1614         add     \$64,%rsp
1615         popfq
1616         pop     %r15
1617         pop     %r14
1618         pop     %r13
1619         pop     %r12
1620         pop     %rbp
1621         pop     %rbx
1622         pop     %rdi
1623         pop     %rsi
1624         ret
1625 .size   se_handler,.-se_handler
1626
1627 .section        .pdata
1628         .rva    .LSEH_begin_${func}_xop
1629         .rva    .LSEH_end_${func}_xop
1630         .rva    .LSEH_info_${func}_xop
1631
1632         .rva    .LSEH_begin_${func}_avx
1633         .rva    .LSEH_end_${func}_avx
1634         .rva    .LSEH_info_${func}_avx
1635 ___
1636 $code.=<<___ if ($avx>1);
1637         .rva    .LSEH_begin_${func}_avx2
1638         .rva    .LSEH_end_${func}_avx2
1639         .rva    .LSEH_info_${func}_avx2
1640 ___
1641 $code.=<<___ if ($shaext);
1642         .rva    .LSEH_begin_${func}_shaext
1643         .rva    .LSEH_end_${func}_shaext
1644         .rva    .LSEH_info_${func}_shaext
1645 ___
1646 $code.=<<___ if ($avx);
1647 .section        .xdata
1648 .align  8
1649 .LSEH_info_${func}_xop:
1650         .byte   9,0,0,0
1651         .rva    se_handler
1652         .rva    .Lprologue_xop,.Lepilogue_xop           # HandlerData[]
1653
1654 .LSEH_info_${func}_avx:
1655         .byte   9,0,0,0
1656         .rva    se_handler
1657         .rva    .Lprologue_avx,.Lepilogue_avx           # HandlerData[]
1658 ___
1659 $code.=<<___ if ($avx>1);
1660 .LSEH_info_${func}_avx2:
1661         .byte   9,0,0,0
1662         .rva    se_handler
1663         .rva    .Lprologue_avx2,.Lepilogue_avx2         # HandlerData[]
1664 ___
1665 $code.=<<___ if ($shaext);
1666 .LSEH_info_${func}_shaext:
1667         .byte   9,0,0,0
1668         .rva    se_handler
1669         .rva    .Lprologue_shaext,.Lepilogue_shaext     # HandlerData[]
1670 ___
1671 }
1672
1673 ####################################################################
1674 sub rex {
1675   local *opcode=shift;
1676   my ($dst,$src)=@_;
1677   my $rex=0;
1678
1679     $rex|=0x04                  if($dst>=8);
1680     $rex|=0x01                  if($src>=8);
1681     unshift @opcode,$rex|0x40   if($rex);
1682 }
1683
1684 {
1685   my %opcodelet = (
1686                 "sha256rnds2" => 0xcb,
1687                 "sha256msg1"  => 0xcc,
1688                 "sha256msg2"  => 0xcd   );
1689
1690   sub sha256op38 {
1691     my $instr = shift;
1692
1693     if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
1694       my @opcode=(0x0f,0x38);
1695         rex(\@opcode,$2,$1);
1696         push @opcode,$opcodelet{$instr};
1697         push @opcode,0xc0|($1&7)|(($2&7)<<3);           # ModR/M
1698         return ".byte\t".join(',',@opcode);
1699     } else {
1700         return $instr."\t".@_[0];
1701     }
1702   }
1703 }
1704
1705 $code =~ s/\`([^\`]*)\`/eval $1/gem;
1706 $code =~ s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/gem;
1707 print $code;
1708 close STDOUT;