# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# This is an AESNI-CBC+SHA256 "stitch" implementation. The idea, as
# spelled out in http://download.intel.com/design/intarch/papers/323686.pdf,
# is that since AESNI-CBC encryption exhibits *very* low instruction-level
# parallelism, interleaving it with another algorithm allows processor
# resources to be utilized better, achieving higher overall performance.
# The SHA256 instruction sequences(*) are taken from sha512-x86_64.pl and
# the AESNI code is woven into them. As SHA256 dominates execution time,
# stitch performance does not depend on AES key length. Below are
# performance numbers in cycles per processed byte, less is better, for
# standalone AESNI-CBC encrypt, standalone SHA256, and the stitched code:
#		 AES-128/-192/-256+SHA256	this(**)	gain
# Sandy Bridge	 5.05/6.05/7.05+11.6		13.0		+28%/36%/43%
# Ivy Bridge	 5.05/6.05/7.05+10.3		11.6		+32%/41%/50%
# Haswell	 4.43/5.29/6.19+7.80		8.79		+39%/49%/59%
# Bulldozer	 5.77/6.89/8.00+13.7		13.7		+42%/50%/58%
# (*)	there are XOP, AVX1 and AVX2 code paths, meaning that Westmere
#	is omitted from the loop, because the gain was not estimated
#	high enough to justify the effort;
# (**)	these are EVP-free results; results obtained with 'speed
#	-evp aes-256-cbc-hmac-sha256' will vary by a percent or two;
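#
# To read the table: for AES-128 on Sandy Bridge, running the two
# primitives back to back would cost about 5.05+11.6 = 16.65 cycles per
# byte, while the stitch measures 13.0, i.e. 16.65/13.0-1 = +28%, the
# first figure in the "gain" column.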
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";
if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.19) + ($1>=2.22);
}

if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.09) + ($1>=2.10);
}

if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	   `ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$avx = ($1>=10) + ($1>=12);
}

if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
	$avx = ($2>=3.0) + ($2>3.0);
}
$shaext=$avx;	### set to zero if compiling for 1.0.1
$avx=1		if (!$shaext && $avx);
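# At this point $avx is 0 (no AVX), 1 (take the XOP/AVX1 paths) or 2
# (the AVX2 path as well), while $shaext gates the SHA extension path.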
open OUT,"| \"$^X\" $xlate $flavour $output";
$func="aesni_cbc_sha256_enc";

@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
				"%r8d","%r9d","%r10d","%r11d");
($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%esi");
########################################################################
# void aesni_cbc_sha256_enc(const void *inp,

($inp, $out, $len, $key, $ivp, $ctx, $in0) =
    ("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
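# In the SysV calling convention the first six of these are the actual
# argument registers; %r10 is not, so the seventh argument is fetched
# from the stack explicitly in each code path ("load 7th parameter"
# below).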
$_inp="16*$SZ+0*8(%rsp)";
$_out="16*$SZ+1*8(%rsp)";
$_end="16*$SZ+2*8(%rsp)";
$_key="16*$SZ+3*8(%rsp)";
$_ivp="16*$SZ+4*8(%rsp)";
$_ctx="16*$SZ+5*8(%rsp)";
$_in0="16*$SZ+6*8(%rsp)";
$_rsp="16*$SZ+7*8(%rsp)";
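# Stack frame sketch (lowest address first): 16*$SZ bytes of X[] message
# schedule scratch, then eight saved qwords: inp, out, end, key, ivp,
# ctx, in0 and the caller's %rsp (plus, on Win64, an xmm6-xmm15 save
# area on top).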
.extern	OPENSSL_ia32cap_P
.type	$func,\@abi-omnipotent

	lea	OPENSSL_ia32cap_P(%rip),%r11
	cmp	\$0,`$win64?"%rcx":"%rdi"`
$code.=<<___ if ($shaext);
	bt	\$61,%r10			# check for SHA
___
	test	\$`1<<11`,%r10d			# check for XOP
$code.=<<___ if ($avx>1);
	and	\$`1<<8|1<<5|1<<3`,%r11d	# check for BMI2+AVX2+BMI1
	cmp	\$`1<<8|1<<5|1<<3`,%r11d
___
	and	\$`1<<28`,%r10d			# check for AVX
	cmp	\$0,`$win64?"%rcx":"%rdi"`
.type	$TABLE,\@object
.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2

.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
.long	0,0,0,0,   0,0,0,0,   -1,-1,-1,-1
.long	0,0,0,0,   0,0,0,0
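# The K256 rows are deliberately duplicated: the AVX2 path processes two
# blocks at once and loads each constant row into both 128-bit lanes of
# a %ymm register with a single 256-bit load. The 0/-1 rows at the end
# are the $mask10/$mask12/$mask14 blend masks, indexed by the AES round
# count, used by the branchless key-length handling described below.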
.asciz	"AESNI-CBC+SHA256 stitch for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
######################################################################
($iv,$inout,$roundkey,$temp,
 $mask10,$mask12,$mask14,$offload)=map("%xmm$_",(8..15));

my @aesni_cbc_block = (
##	&vmovdqu	($roundkey,"0x00-0x80($inp)");
##	&vmovdqu	($inout,($inp));
##	&mov		($_inp,$inp);
	'&vpxor		($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x10-0x80($inp)");',

	'&vpxor		($inout,$inout,$iv);',

	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x20-0x80($inp)");',

	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x30-0x80($inp)");',

	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x40-0x80($inp)");',

	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x50-0x80($inp)");',

	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x60-0x80($inp)");',

	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x70-0x80($inp)");',

	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x80-0x80($inp)");',

	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x90-0x80($inp)");',

	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0xa0-0x80($inp)");',

	'&vaesenclast	($temp,$inout,$roundkey);'.
	' &vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0xb0-0x80($inp)");',

	'&vpand		($iv,$temp,$mask10);'.
	' &vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0xc0-0x80($inp)");',

	'&vaesenclast	($temp,$inout,$roundkey);'.
	' &vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0xd0-0x80($inp)");',

	'&vpand		($temp,$temp,$mask12);'.
	' &vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0xe0-0x80($inp)");',

	'&vpor		($iv,$iv,$temp);'.
	' &vaesenclast	($temp,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x00-0x80($inp)");'
);
##	&mov		($inp,$_inp);
##	&mov		($out,$_out);
##	&vpand		($temp,$temp,$mask14);
##	&vpor		($iv,$iv,$temp);
##	&vmovdqu	($iv,"($out,$inp)");
##	&lea		($inp,"16($inp)");
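# Note the branchless handling of the AES key length above: speculative
# vaesenclast results are taken after 10, 12 and 14 rounds, each masked
# with $mask10/$mask12/$mask14 (exactly one of which is all-ones for the
# actual round count) and vpor-merged, so the correct ciphertext lands
# in $iv without a single branch.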
my ($a,$b,$c,$d,$e,$f,$g,$h);

sub AUTOLOAD()		# thunk [simplified] 32-bit style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
  my $arg = pop;
    $arg = "\$$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
}
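# e.g. &ror($a0,14) appends "\tror\t\$14,%r13d\n" to $code: the last
# argument is popped, a numeric argument gains the AT&T '$' prefix, and
# the operands are emitted in reverse (AT&T) order.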
	'($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
	'&ror	($a0,$Sigma1[2]-$Sigma1[1])',
	'&ror	($a1,$Sigma0[2]-$Sigma0[1])',
	'&xor	($a4,$g)',			# f^g
	'&ror	($a0,$Sigma1[1]-$Sigma1[0])',
	'&and	($a4,$e)',			# (f^g)&e
	@aesni_cbc_block[$aesni_cbc_idx++].
	'&add	($h,$SZ*($i&15)."(%rsp)")',	# h+=X[i]+K[i]
	'&ror	($a1,$Sigma0[1]-$Sigma0[0])',
	'&xor	($a4,$g)',			# Ch(e,f,g)=((f^g)&e)^g
	'&xor	($a2,$b)',			# a^b, b^c in next round
	'&ror	($a0,$Sigma1[0])',		# Sigma1(e)
	'&add	($h,$a4)',			# h+=Ch(e,f,g)
	'&and	($a3,$a2)',			# (b^c)&(a^b)
	'&add	($h,$a0)',			# h+=Sigma1(e)
	'&xor	($a3,$b)',			# Maj(a,b,c)=Ch(a^b,c,b)
	'&add	($d,$h)',			# d+=h
	'&ror	($a1,$Sigma0[0])',		# Sigma0(a)
	'&add	($h,$a3)',			# h+=Maj(a,b,c)

	'&add	($a1,$h);'.			# h+=Sigma0(a)
	'($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
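# This is one SHA256 round (FIPS 180-4): T1 = h + Sigma1(e) + Ch(e,f,g)
# + K[i] + W[i] (K[i]+W[i] arrives pre-added on the stack), T2 =
# Sigma0(a) + Maj(a,b,c), then d += T1 and h = T1 + T2; rotating @ROT
# replaces the a..h register shuffle, and one AES fragment from
# @aesni_cbc_block is interleaved per round.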
######################################################################
.type	${func}_xop,\@function,6

	mov	`($win64?56:8)`(%rsp),$in0	# load 7th parameter
	mov	%rsp,%r11			# copy %rsp
	sub	\$`$framesz+$win64*16*10`,%rsp
	and	\$-64,%rsp			# align stack frame
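	# (anding with -64 clears the six low bits, i.e. rounds %rsp
	# down to a 64-byte boundary)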
	sub	$inp,$out		# re-bias
	add	$inp,$len		# end of input
	#mov	$inp,$_inp		# saved later
	#mov	$key,$_key		# remains resident in $inp register
$code.=<<___ if ($win64);
	movaps	%xmm6,`$framesz+16*0`(%rsp)
	movaps	%xmm7,`$framesz+16*1`(%rsp)
	movaps	%xmm8,`$framesz+16*2`(%rsp)
	movaps	%xmm9,`$framesz+16*3`(%rsp)
	movaps	%xmm10,`$framesz+16*4`(%rsp)
	movaps	%xmm11,`$framesz+16*5`(%rsp)
	movaps	%xmm12,`$framesz+16*6`(%rsp)
	movaps	%xmm13,`$framesz+16*7`(%rsp)
	movaps	%xmm14,`$framesz+16*8`(%rsp)
	movaps	%xmm15,`$framesz+16*9`(%rsp)
___
	mov	$inp,%r12			# borrow $a4
	lea	0x80($key),$inp			# size optimization, reassign
	lea	$TABLE+`$SZ*2*$rounds+32`(%rip),%r13	# borrow $a0
	mov	0xf0-0x80($inp),%r14d		# rounds, borrow $a1
	mov	$ctx,%r15			# borrow $a2
	mov	$in0,%rsi			# borrow $a3
	vmovdqu	($ivp),$iv			# load IV

	vmovdqa	0x00(%r13,%r14,8),$mask14
	vmovdqa	0x10(%r13,%r14,8),$mask12
	vmovdqa	0x20(%r13,%r14,8),$mask10
	vmovdqu	0x00-0x80($inp),$roundkey
if ($SZ==4) {	# SHA256
    my @X = map("%xmm$_",(0..3));
    my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));

	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
	vmovdqu	0x00(%rsi,%r12),@X[0]
	vmovdqu	0x10(%rsi,%r12),@X[1]
	vmovdqu	0x20(%rsi,%r12),@X[2]
	vmovdqu	0x30(%rsi,%r12),@X[3]
	vpshufb	$t3,@X[0],@X[0]
	lea	$TABLE(%rip),$Tbl
	vpshufb	$t3,@X[1],@X[1]
	vpshufb	$t3,@X[2],@X[2]
	vpaddd	0x00($Tbl),@X[0],$t0
	vpshufb	$t3,@X[3],@X[3]
	vpaddd	0x20($Tbl),@X[1],$t1
	vpaddd	0x40($Tbl),@X[2],$t2
	vpaddd	0x60($Tbl),@X[3],$t3
	vmovdqa	$t0,0x00(%rsp)
	vmovdqa	$t1,0x10(%rsp)
	vmovdqa	$t2,0x20(%rsp)
	vmovdqa	$t3,0x30(%rsp)
	sub	\$-16*2*$SZ,$Tbl	# size optimization
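	# (subtracting -128 fits a sign-extended 8-bit immediate,
	# whereas adding +128 would need a 32-bit one - hence the
	# "size optimization")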
	vmovdqu	(%r12),$inout		# $a4
sub XOP_256_00_47 () {
my $j = shift;
my $body = shift;
my @X = @_;
my @insns = (&$body,&$body,&$body,&$body);	# 104 instructions

	&vpalignr	($t0,@X[1],@X[0],$SZ);	# X[1..4]
	&vpalignr	($t3,@X[3],@X[2],$SZ);	# X[9..12]
	&vprotd		($t1,$t0,8*$SZ-$sigma0[1]);
	&vpsrld		($t0,$t0,$sigma0[2]);
	&vpaddd		(@X[0],@X[0],$t3);	# X[0..3] += X[9..12]
	&vprotd		($t2,$t1,$sigma0[1]-$sigma0[0]);
	&vpxor		($t0,$t0,$t1);
	&vprotd		($t3,@X[3],8*$SZ-$sigma1[1]);
	&vpxor		($t0,$t0,$t2);		# sigma0(X[1..4])
	&vpsrld		($t2,@X[3],$sigma1[2]);
	&vpaddd		(@X[0],@X[0],$t0);	# X[0..3] += sigma0(X[1..4])
	&vprotd		($t1,$t3,$sigma1[1]-$sigma1[0]);
	&vpxor		($t3,$t3,$t2);
	&vpxor		($t3,$t3,$t1);		# sigma1(X[14..15])
	&vpsrldq	($t3,$t3,8);
	&vpaddd		(@X[0],@X[0],$t3);	# X[0..1] += sigma1(X[14..15])
	&vprotd		($t3,@X[0],8*$SZ-$sigma1[1]);
	&vpsrld		($t2,@X[0],$sigma1[2]);
	&vprotd		($t1,$t3,$sigma1[1]-$sigma1[0]);
	&vpxor		($t3,$t3,$t2);
	&vpxor		($t3,$t3,$t1);		# sigma1(X[16..17])
	&vpslldq	($t3,$t3,8);		# 22 instructions
	&vpaddd		(@X[0],@X[0],$t3);	# X[2..3] += sigma1(X[16..17])
	&vpaddd		($t2,@X[0],16*2*$j."($Tbl)");
	foreach (@insns) { eval; }		# remaining instructions
	&vmovdqa	(16*$j."(%rsp)",$t2);
}
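# XOP's vprotd rotates 32-bit lanes in one instruction, so each sigma
# rotation above costs a single op, unlike the vpslld/vpsrld/vpxor
# triples the plain AVX path below has to spend on the same rotates.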
    for ($i=0,$j=0; $j<4; $j++) {
	&XOP_256_00_47($j,\&body_00_15,@X);
	push(@X,shift(@X));			# rotate(@X)
    }
	&mov	("%r12",$_inp);			# borrow $a4
	&vpand	($temp,$temp,$mask14);
	&mov	("%r15",$_out);			# borrow $a2
	&vpor	($iv,$iv,$temp);
	&vmovdqu("(%r15,%r12)",$iv);		# write output
	&lea	("%r12","16(%r12)");		# inp++

	&cmpb	($SZ-1+16*2*$SZ."($Tbl)",0);
	&jne	(".Lxop_00_47");

	&vmovdqu($inout,"(%r12)");
    for ($i=0; $i<16; ) {
	foreach(body_00_15()) { eval; }
    }
	mov	$_inp,%r12		# borrow $a4
	mov	$_out,%r13		# borrow $a0
	mov	$_ctx,%r15		# borrow $a2
	mov	$_in0,%rsi		# borrow $a3

	vpand	$mask14,$temp,$temp
	vmovdqu	$iv,(%r13,%r12)		# write output
	lea	16(%r12),%r12		# inp++
	vmovdqu	$iv,($ivp)		# output IV
$code.=<<___ if ($win64);
	movaps	`$framesz+16*0`(%rsp),%xmm6
	movaps	`$framesz+16*1`(%rsp),%xmm7
	movaps	`$framesz+16*2`(%rsp),%xmm8
	movaps	`$framesz+16*3`(%rsp),%xmm9
	movaps	`$framesz+16*4`(%rsp),%xmm10
	movaps	`$framesz+16*5`(%rsp),%xmm11
	movaps	`$framesz+16*6`(%rsp),%xmm12
	movaps	`$framesz+16*7`(%rsp),%xmm13
	movaps	`$framesz+16*8`(%rsp),%xmm14
	movaps	`$framesz+16*9`(%rsp),%xmm15
___
.size	${func}_xop,.-${func}_xop
######################################################################
local *ror = sub { &shrd(@_[0],@_) };
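# (shrd with the same register as both source and destination behaves
# exactly like ror; rebinding &ror makes body_00_15 emit shrd on this
# code path)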
.type	${func}_avx,\@function,6
	mov	`($win64?56:8)`(%rsp),$in0	# load 7th parameter
	mov	%rsp,%r11			# copy %rsp
	sub	\$`$framesz+$win64*16*10`,%rsp
	and	\$-64,%rsp			# align stack frame
	sub	$inp,$out		# re-bias
	add	$inp,$len		# end of input
	#mov	$inp,$_inp		# saved later
	#mov	$key,$_key		# remains resident in $inp register
$code.=<<___ if ($win64);
	movaps	%xmm6,`$framesz+16*0`(%rsp)
	movaps	%xmm7,`$framesz+16*1`(%rsp)
	movaps	%xmm8,`$framesz+16*2`(%rsp)
	movaps	%xmm9,`$framesz+16*3`(%rsp)
	movaps	%xmm10,`$framesz+16*4`(%rsp)
	movaps	%xmm11,`$framesz+16*5`(%rsp)
	movaps	%xmm12,`$framesz+16*6`(%rsp)
	movaps	%xmm13,`$framesz+16*7`(%rsp)
	movaps	%xmm14,`$framesz+16*8`(%rsp)
	movaps	%xmm15,`$framesz+16*9`(%rsp)
___
	mov	$inp,%r12			# borrow $a4
	lea	0x80($key),$inp			# size optimization, reassign
	lea	$TABLE+`$SZ*2*$rounds+32`(%rip),%r13	# borrow $a0
	mov	0xf0-0x80($inp),%r14d		# rounds, borrow $a1
	mov	$ctx,%r15			# borrow $a2
	mov	$in0,%rsi			# borrow $a3
	vmovdqu	($ivp),$iv			# load IV

	vmovdqa	0x00(%r13,%r14,8),$mask14
	vmovdqa	0x10(%r13,%r14,8),$mask12
	vmovdqa	0x20(%r13,%r14,8),$mask10
	vmovdqu	0x00-0x80($inp),$roundkey
if ($SZ==4) {	# SHA256
    my @X = map("%xmm$_",(0..3));
    my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));

	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
	vmovdqu	0x00(%rsi,%r12),@X[0]
	vmovdqu	0x10(%rsi,%r12),@X[1]
	vmovdqu	0x20(%rsi,%r12),@X[2]
	vmovdqu	0x30(%rsi,%r12),@X[3]
	vpshufb	$t3,@X[0],@X[0]
	lea	$TABLE(%rip),$Tbl
	vpshufb	$t3,@X[1],@X[1]
	vpshufb	$t3,@X[2],@X[2]
	vpaddd	0x00($Tbl),@X[0],$t0
	vpshufb	$t3,@X[3],@X[3]
	vpaddd	0x20($Tbl),@X[1],$t1
	vpaddd	0x40($Tbl),@X[2],$t2
	vpaddd	0x60($Tbl),@X[3],$t3
	vmovdqa	$t0,0x00(%rsp)
	vmovdqa	$t1,0x10(%rsp)
	vmovdqa	$t2,0x20(%rsp)
	vmovdqa	$t3,0x30(%rsp)
	sub	\$-16*2*$SZ,$Tbl	# size optimization
	vmovdqu	(%r12),$inout		# $a4
sub Xupdate_256_AVX () {
	(
	'&vpalignr	($t0,@X[1],@X[0],$SZ)',	# X[1..4]
	'&vpalignr	($t3,@X[3],@X[2],$SZ)',	# X[9..12]
	'&vpsrld	($t2,$t0,$sigma0[0]);',
	'&vpaddd	(@X[0],@X[0],$t3)',	# X[0..3] += X[9..12]
	'&vpsrld	($t3,$t0,$sigma0[2])',
	'&vpslld	($t1,$t0,8*$SZ-$sigma0[1]);',
	'&vpxor		($t0,$t3,$t2)',
	'&vpshufd	($t3,@X[3],0b11111010)',# X[14..15]
	'&vpsrld	($t2,$t2,$sigma0[1]-$sigma0[0]);',
	'&vpxor		($t0,$t0,$t1)',
	'&vpslld	($t1,$t1,$sigma0[1]-$sigma0[0]);',
	'&vpxor		($t0,$t0,$t2)',
	'&vpsrld	($t2,$t3,$sigma1[2]);',
	'&vpxor		($t0,$t0,$t1)',		# sigma0(X[1..4])
	'&vpsrlq	($t3,$t3,$sigma1[0]);',
	'&vpaddd	(@X[0],@X[0],$t0)',	# X[0..3] += sigma0(X[1..4])
	'&vpxor		($t2,$t2,$t3);',
	'&vpsrlq	($t3,$t3,$sigma1[1]-$sigma1[0])',
	'&vpxor		($t2,$t2,$t3)',		# sigma1(X[14..15])
	'&vpshufd	($t2,$t2,0b10000100)',
	'&vpsrldq	($t2,$t2,8)',
	'&vpaddd	(@X[0],@X[0],$t2)',	# X[0..1] += sigma1(X[14..15])
	'&vpshufd	($t3,@X[0],0b01010000)',# X[16..17]
	'&vpsrld	($t2,$t3,$sigma1[2])',
	'&vpsrlq	($t3,$t3,$sigma1[0])',
	'&vpxor		($t2,$t2,$t3);',
	'&vpsrlq	($t3,$t3,$sigma1[1]-$sigma1[0])',
	'&vpxor		($t2,$t2,$t3)',
	'&vpshufd	($t2,$t2,0b11101000)',
	'&vpslldq	($t2,$t2,8)',
	'&vpaddd	(@X[0],@X[0],$t2)'	# X[2..3] += sigma1(X[16..17])
	);
}
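# For SHA256, sigma0(x) = ROTR7(x)^ROTR18(x)^SHR3(x) and sigma1(x) =
# ROTR17(x)^ROTR19(x)^SHR10(x); with @sigma0=(7,18,3) and
# @sigma1=(17,19,10) each ROTR above is composed from a left/right
# shift pair xored together, since AVX1 has no vector rotate.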
sub AVX_256_00_47 () {
my $j = shift;
my $body = shift;
my @X = @_;
my @insns = (&$body,&$body,&$body,&$body);	# 104 instructions

	foreach (Xupdate_256_AVX()) {		# 29 instructions
	    eval;
	    eval(shift(@insns));
	    eval(shift(@insns));
	    eval(shift(@insns));
	}
	&vpaddd	($t2,@X[0],16*2*$j."($Tbl)");
	foreach (@insns) { eval; }		# remaining instructions
	&vmovdqa	(16*$j."(%rsp)",$t2);
}
    for ($i=0,$j=0; $j<4; $j++) {
	&AVX_256_00_47($j,\&body_00_15,@X);
	push(@X,shift(@X));			# rotate(@X)
    }
	&mov	("%r12",$_inp);			# borrow $a4
	&vpand	($temp,$temp,$mask14);
	&mov	("%r15",$_out);			# borrow $a2
	&vpor	($iv,$iv,$temp);
	&vmovdqu("(%r15,%r12)",$iv);		# write output
	&lea	("%r12","16(%r12)");		# inp++

	&cmpb	($SZ-1+16*2*$SZ."($Tbl)",0);
	&jne	(".Lavx_00_47");

	&vmovdqu($inout,"(%r12)");
    for ($i=0; $i<16; ) {
	foreach(body_00_15()) { eval; }
    }
	mov	$_inp,%r12		# borrow $a4
	mov	$_out,%r13		# borrow $a0
	mov	$_ctx,%r15		# borrow $a2
	mov	$_in0,%rsi		# borrow $a3

	vpand	$mask14,$temp,$temp
	vmovdqu	$iv,(%r13,%r12)		# write output
	lea	16(%r12),%r12		# inp++
	vmovdqu	$iv,($ivp)		# output IV
$code.=<<___ if ($win64);
	movaps	`$framesz+16*0`(%rsp),%xmm6
	movaps	`$framesz+16*1`(%rsp),%xmm7
	movaps	`$framesz+16*2`(%rsp),%xmm8
	movaps	`$framesz+16*3`(%rsp),%xmm9
	movaps	`$framesz+16*4`(%rsp),%xmm10
	movaps	`$framesz+16*5`(%rsp),%xmm11
	movaps	`$framesz+16*6`(%rsp),%xmm12
	movaps	`$framesz+16*7`(%rsp),%xmm13
	movaps	`$framesz+16*8`(%rsp),%xmm14
	movaps	`$framesz+16*9`(%rsp),%xmm15
___
.size	${func}_avx,.-${func}_avx
######################################################################
my $a5=$SZ==4?"%esi":"%rsi";	# zap $inp

# at start $a1 should be zero, $a3 - $b^$c and $a4 copy of $f
	'($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
	'&add	($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)',	# h+=X[i]+K[i]
	'&and	($a4,$e)',		# f&e
	'&rorx	($a0,$e,$Sigma1[2])',
	'&rorx	($a2,$e,$Sigma1[1])',
	'&lea	($a,"($a,$a1)")',	# h+=Sigma0(a) from the past
	'&lea	($h,"($h,$a4)")',
	'&andn	($a4,$e,$g)',		# ~e&g
	'&rorx	($a1,$e,$Sigma1[0])',
	'&lea	($h,"($h,$a4)")',	# h+=Ch(e,f,g)=(e&f)+(~e&g)
	'&xor	($a0,$a1)',		# Sigma1(e)
	'&rorx	($a4,$a,$Sigma0[2])',
	'&lea	($h,"($h,$a0)")',	# h+=Sigma1(e)
	'&xor	($a2,$b)',		# a^b, b^c in next round
	'&rorx	($a1,$a,$Sigma0[1])',
	'&rorx	($a0,$a,$Sigma0[0])',
	'&lea	($d,"($d,$h)")',	# d+=h
	'&and	($a3,$a2)',		# (b^c)&(a^b)
	@aesni_cbc_block[$aesni_cbc_idx++].
	'&xor	($a3,$b)',		# Maj(a,b,c)=Ch(a^b,c,b)
	'&xor	($a1,$a0)',		# Sigma0(a)
	'&lea	($h,"($h,$a3)");'.	# h+=Maj(a,b,c)
	'&mov	($a4,$e)',		# copy of f in future

	'($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
# and at the finish one has to $a+=$a1
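# The lea-based Ch works because (e&f) and (~e&g) can never have a set
# bit in the same position, so their sum equals their xor; likewise
# Sigma0(a) is left in $a1 and only folded in at the start of the *next*
# round ("from the past"), which is why $a1 must finally be added to $a.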
.type	${func}_avx2,\@function,6

	mov	`($win64?56:8)`(%rsp),$in0	# load 7th parameter
	mov	%rsp,%r11			# copy %rsp
	sub	\$`2*$SZ*$rounds+8*8+$win64*16*10`,%rsp
	and	\$-256*$SZ,%rsp			# align stack frame
	add	\$`2*$SZ*($rounds-8)`,%rsp
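	# the AVX2 body keeps the K[]+W[] rows in a window that it
	# slides downward as it goes (lea -$PUSH8(%rsp) below), so the
	# frame is over-allocated and over-aligned here and walked back
	# at the end with lea `2*$SZ*($rounds-8)`(%rsp),%rsp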
	sub	$inp,$out		# re-bias
	add	$inp,$len		# end of input
	#mov	$inp,$_inp		# saved later
	#mov	$out,$_out		# kept in $offload
	#mov	$key,$_key		# remains resident in $inp register
$code.=<<___ if ($win64);
	movaps	%xmm6,`$framesz+16*0`(%rsp)
	movaps	%xmm7,`$framesz+16*1`(%rsp)
	movaps	%xmm8,`$framesz+16*2`(%rsp)
	movaps	%xmm9,`$framesz+16*3`(%rsp)
	movaps	%xmm10,`$framesz+16*4`(%rsp)
	movaps	%xmm11,`$framesz+16*5`(%rsp)
	movaps	%xmm12,`$framesz+16*6`(%rsp)
	movaps	%xmm13,`$framesz+16*7`(%rsp)
	movaps	%xmm14,`$framesz+16*8`(%rsp)
	movaps	%xmm15,`$framesz+16*9`(%rsp)
___
	mov	$inp,%r13			# borrow $a0
	vpinsrq	\$1,$out,$offload,$offload
	lea	0x80($key),$inp			# size optimization, reassign
	lea	$TABLE+`$SZ*2*$rounds+32`(%rip),%r12	# borrow $a4
	mov	0xf0-0x80($inp),%r14d		# rounds, borrow $a1
	mov	$ctx,%r15			# borrow $a2
	mov	$in0,%rsi			# borrow $a3
	vmovdqu	($ivp),$iv			# load IV

	vmovdqa	0x00(%r12,%r14,8),$mask14
	vmovdqa	0x10(%r12,%r14,8),$mask12
	vmovdqa	0x20(%r12,%r14,8),$mask10

	sub	\$-16*$SZ,%r13			# inp++, size optimization
	lea	(%rsi,%r13),%r12		# borrow $a0
	cmp	$len,%r13			# $_end
	cmove	%rsp,%r12			# next block or random data
	vmovdqu	0x00-0x80($inp),$roundkey
if ($SZ==4) {	# SHA256
    my @X = map("%ymm$_",(0..3));
    my ($t0,$t1,$t2,$t3) = map("%ymm$_",(4..7));

	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
	vmovdqu	-16*$SZ+0(%rsi,%r13),%xmm0
	vmovdqu	-16*$SZ+16(%rsi,%r13),%xmm1
	vmovdqu	-16*$SZ+32(%rsi,%r13),%xmm2
	vmovdqu	-16*$SZ+48(%rsi,%r13),%xmm3

	vinserti128	\$1,(%r12),@X[0],@X[0]
	vinserti128	\$1,16(%r12),@X[1],@X[1]
	vpshufb		$t3,@X[0],@X[0]
	vinserti128	\$1,32(%r12),@X[2],@X[2]
	vpshufb		$t3,@X[1],@X[1]
	vinserti128	\$1,48(%r12),@X[3],@X[3]
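	# the low 128-bit lanes hold the current block; vinserti128 \$1
	# pulls the following block (via %r12) into the high lanes, so
	# each pass hashes two blocks in parallel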
	lea	$TABLE(%rip),$Tbl
	vpshufb	$t3,@X[2],@X[2]
	lea	-16*$SZ(%r13),%r13
	vpaddd	0x00($Tbl),@X[0],$t0
	vpshufb	$t3,@X[3],@X[3]
	vpaddd	0x20($Tbl),@X[1],$t1
	vpaddd	0x40($Tbl),@X[2],$t2
	vpaddd	0x60($Tbl),@X[3],$t3
	vmovdqa	$t0,0x00(%rsp)
	vmovdqa	$t1,0x20(%rsp)
	lea	-$PUSH8(%rsp),%rsp
	vmovdqa	$t2,0x00(%rsp)
	vmovdqa	$t3,0x20(%rsp)
	sub	\$-16*2*$SZ,$Tbl	# size optimization

	vmovdqu	(%r13),$inout
	vpinsrq	\$0,%r13,$offload,$offload
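	# $offload (an %xmm register) doubles as spill space: lane 0
	# carries the input pointer and lane 1 the output pointer
	# (vpinsrq above), freeing both GPRs for the round function
	# until vmovq/vpextrq fetch them back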
sub AVX2_256_00_47 () {
my $j = shift;
my $body = shift;
my @X = @_;
my @insns = (&$body,&$body,&$body,&$body);	# 96 instructions
my $base = "+2*$PUSH8(%rsp)";

	&lea	("%rsp","-$PUSH8(%rsp)")	if (($j%2)==0);
	foreach (Xupdate_256_AVX()) {		# 29 instructions
	    eval;
	    eval(shift(@insns));
	    eval(shift(@insns));
	    eval(shift(@insns));
	}
	&vpaddd	($t2,@X[0],16*2*$j."($Tbl)");
	foreach (@insns) { eval; }		# remaining instructions
	&vmovdqa	((32*$j)%$PUSH8."(%rsp)",$t2);
}
    for ($i=0,$j=0; $j<4; $j++) {
	&AVX2_256_00_47($j,\&bodyx_00_15,@X);
	push(@X,shift(@X));			# rotate(@X)
    }
	&vmovq	("%r13",$offload);		# borrow $a0
	&vpextrq("%r15",$offload,1);		# borrow $a2
	&vpand	($temp,$temp,$mask14);
	&vpor	($iv,$iv,$temp);
	&vmovdqu("(%r15,%r13)",$iv);		# write output
	&lea	("%r13","16(%r13)");		# inp++

	&lea	($Tbl,16*2*$SZ."($Tbl)");
	&cmpb	(($SZ-1)."($Tbl)",0);
	&jne	(".Lavx2_00_47");

	&vmovdqu($inout,"(%r13)");
	&vpinsrq($offload,$offload,"%r13",0);
    for ($i=0; $i<16; ) {
	my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
	foreach(bodyx_00_15()) { eval; }
    }
	vpextrq	\$1,$offload,%r12		# $_out, borrow $a4
	vmovq	$offload,%r13			# $_inp, borrow $a0
	mov	`2*$SZ*$rounds+5*8`(%rsp),%r15	# $_ctx, borrow $a2
	lea	`2*$SZ*($rounds-8)`(%rsp),$Tbl

	vpand	$mask14,$temp,$temp
	vmovdqu	$iv,(%r12,%r13)			# write output
	cmp	`$PUSH8+2*8`($Tbl),%r13		# $_end
	vmovdqu	(%r13),$inout
	vpinsrq	\$0,%r13,$offload,$offload
    for ($i=0; $i<16; ) {
	my $base="+16($Tbl)";
	foreach(bodyx_00_15()) { eval; }
	&lea	($Tbl,"-$PUSH8($Tbl)")	if ($i==8);
    }
	vmovq	$offload,%r13			# borrow $a0
	vpextrq	\$1,$offload,%r15		# borrow $a2
	vpand	$mask14,$temp,$temp
	lea	-$PUSH8($Tbl),$Tbl
	vmovdqu	$iv,(%r15,%r13)			# write output
	lea	16(%r13),%r13			# inp++
	mov	`2*$SZ*$rounds+5*8`(%rsp),%r15	# $_ctx, borrow $a2
	lea	16*$SZ(%r13),%r13
	mov	`2*$SZ*$rounds+6*8`(%rsp),%rsi	# $_in0, borrow $a3
	lea	`2*$SZ*($rounds-8)`(%rsp),%rsp

	lea	(%rsi,%r13),%r12
	cmove	%rsp,%r12			# next block or stale data
	vmovdqu	$iv,($ivp)		# output IV
$code.=<<___ if ($win64);
	movaps	`$framesz+16*0`(%rsp),%xmm6
	movaps	`$framesz+16*1`(%rsp),%xmm7
	movaps	`$framesz+16*2`(%rsp),%xmm8
	movaps	`$framesz+16*3`(%rsp),%xmm9
	movaps	`$framesz+16*4`(%rsp),%xmm10
	movaps	`$framesz+16*5`(%rsp),%xmm11
	movaps	`$framesz+16*6`(%rsp),%xmm12
	movaps	`$framesz+16*7`(%rsp),%xmm13
	movaps	`$framesz+16*8`(%rsp),%xmm14
	movaps	`$framesz+16*9`(%rsp),%xmm15
___
.size	${func}_avx2,.-${func}_avx2
my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");

my ($rounds,$Tbl)=("%r11d","%rbx");

my ($iv,$in,$rndkey0)=map("%xmm$_",(6,14,15));
my @rndkey=("%xmm4","%xmm5");

my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..3,7..9));
my @MSG=map("%xmm$_",(10..13));
my ($n,$k)=($r/10,$r%10);

	movups		`16*$n`($in0),$in		# load input

$code.=<<___ if ($n);
	movups		$iv,`16*($n-1)`($out,$in0)	# write output
___

	movups		`32+16*$k-112`($key),$rndkey[1]
	aesenc		$rndkey[0],$iv

	movups		`32+16*($k+0)-112`($key),$rndkey[1]
	aesenc		$rndkey[0],$iv
	movups		`32+16*($k+1)-112`($key),$rndkey[0]
	aesenc		$rndkey[1],$iv

	movups		`32+16*($k+2)-112`($key),$rndkey[1]
	aesenc		$rndkey[0],$iv
	movups		`32+16*($k+3)-112`($key),$rndkey[0]
	aesenc		$rndkey[1],$iv

	aesenclast	$rndkey[0],$iv
	movups		16-112($key),$rndkey[1]		# forward reference

	movups		`32+16*$k-112`($key),$rndkey[1]
	aesenc		$rndkey[0],$iv

$r++;	unshift(@rndkey,pop(@rndkey));
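# Each call feeds exactly one step of AES-CBC into the SHA round
# stream: $r counts interleave slots, $n=$r/10 picks the data block and
# $k=$r%10 the slot within it, while @rndkey rotates so the next round
# key load is always issued one step ahead ("forward reference").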
.type	${func}_shaext,\@function,6

	mov	`($win64?56:8)`(%rsp),$inp	# load 7th argument
$code.=<<___ if ($win64);
	lea	`-8-10*16`(%rsp),%rsp
	movaps	%xmm6,-8-10*16(%rax)
	movaps	%xmm7,-8-9*16(%rax)
	movaps	%xmm8,-8-8*16(%rax)
	movaps	%xmm9,-8-7*16(%rax)
	movaps	%xmm10,-8-6*16(%rax)
	movaps	%xmm11,-8-5*16(%rax)
	movaps	%xmm12,-8-4*16(%rax)
	movaps	%xmm13,-8-3*16(%rax)
	movaps	%xmm14,-8-2*16(%rax)
	movaps	%xmm15,-8-1*16(%rax)
___
	lea	K256+0x80(%rip),$Tbl
	movdqu	($ctx),$ABEF		# DCBA
	movdqu	16($ctx),$CDGH		# HGFE
	movdqa	0x200-0x80($Tbl),$TMP	# byte swap mask

	mov	240($key),$rounds
	movups	($key),$rndkey0		# $key[0]
	movups	16($key),$rndkey[0]	# forward reference
	lea	112($key),$key		# size optimization

	pshufd	\$0x1b,$ABEF,$Wi	# ABCD
	pshufd	\$0xb1,$ABEF,$ABEF	# CDAB
	pshufd	\$0x1b,$CDGH,$CDGH	# EFGH
	movdqa	$TMP,$BSWAP		# offload
	palignr	\$8,$CDGH,$ABEF		# ABEF
	punpcklqdq	$Wi,$CDGH	# CDGH
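	# sha256rnds2 expects the state as ABEF/CDGH pairs, so the
	# pshufd/palignr/punpcklqdq sequence above rearranges the DCBA/
	# HGFE word order that SHA256_CTX stores in memory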
	movdqu	($inp),@MSG[0]
	movdqu	0x10($inp),@MSG[1]
	movdqu	0x20($inp),@MSG[2]
	movdqu	0x30($inp),@MSG[3]
	movdqa	0*32-0x80($Tbl),$Wi
	movdqa	$CDGH,$CDGH_SAVE	# offload
	movdqa	$ABEF,$ABEF_SAVE	# offload
	sha256rnds2	$ABEF,$CDGH	# 0-3
	pshufd		\$0x0e,$Wi,$Wi
	sha256rnds2	$CDGH,$ABEF

	movdqa	1*32-0x80($Tbl),$Wi
	sha256rnds2	$ABEF,$CDGH	# 4-7
	pshufd		\$0x0e,$Wi,$Wi
	sha256rnds2	$CDGH,$ABEF

	movdqa	2*32-0x80($Tbl),$Wi
	sha256msg1	@MSG[1],@MSG[0]
	sha256rnds2	$ABEF,$CDGH	# 8-11
	pshufd		\$0x0e,$Wi,$Wi
	palignr		\$4,@MSG[2],$TMP
	sha256rnds2	$CDGH,$ABEF

	movdqa	3*32-0x80($Tbl),$Wi
	sha256msg2	@MSG[3],@MSG[0]
	sha256msg1	@MSG[2],@MSG[1]
	sha256rnds2	$ABEF,$CDGH	# 12-15
	pshufd		\$0x0e,$Wi,$Wi
	palignr		\$4,@MSG[3],$TMP
	sha256rnds2	$CDGH,$ABEF
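	# each 32-byte K256 row carries four round constants (the paddd
	# that folds in the message words is elided in this excerpt);
	# sha256rnds2 consumes two rounds' worth from the implicit xmm0
	# ($Wi), and pshufd \$0x0e drops the upper pair into the low
	# half for the second sha256rnds2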
for($i=4;$i<16-3;$i++) {
  &$aesenc()		if (($r%10)==0);

	movdqa	$i*32-0x80($Tbl),$Wi
	sha256msg2	@MSG[0],@MSG[1]
	sha256msg1	@MSG[3],@MSG[2]
	sha256rnds2	$ABEF,$CDGH	# 16-19...
	pshufd		\$0x0e,$Wi,$Wi
	palignr		\$4,@MSG[0],$TMP

  &$aesenc()		if ($r==19);

	sha256rnds2	$CDGH,$ABEF

  push(@MSG,shift(@MSG));
}
	movdqa	13*32-0x80($Tbl),$Wi
	sha256msg2	@MSG[0],@MSG[1]
	sha256msg1	@MSG[3],@MSG[2]
	sha256rnds2	$ABEF,$CDGH	# 52-55
	pshufd		\$0x0e,$Wi,$Wi
	palignr		\$4,@MSG[0],$TMP
	sha256rnds2	$CDGH,$ABEF

	movdqa	14*32-0x80($Tbl),$Wi
	sha256msg2	@MSG[1],@MSG[2]
	sha256rnds2	$ABEF,$CDGH	# 56-59
	pshufd		\$0x0e,$Wi,$Wi
	sha256rnds2	$CDGH,$ABEF

	movdqa	15*32-0x80($Tbl),$Wi
	sha256rnds2	$ABEF,$CDGH	# 60-63
	pshufd		\$0x0e,$Wi,$Wi
	sha256rnds2	$CDGH,$ABEF
	#pxor	$CDGH,$rndkey0		# black magic
while ($r<40)	{ &$aesenc(); }		# remaining aesenc's
	#xorps	$CDGH,$rndkey0		# black magic
	paddd	$CDGH_SAVE,$CDGH
	paddd	$ABEF_SAVE,$ABEF

	movups	$iv,48($out,$in0)	# write output
	pshufd	\$0xb1,$CDGH,$CDGH	# DCHG
	pshufd	\$0x1b,$ABEF,$TMP	# FEBA
	pshufd	\$0xb1,$ABEF,$ABEF	# BAFE
	punpckhqdq	$CDGH,$ABEF	# DCBA
	palignr	\$8,$TMP,$CDGH		# HGFE

	movups	$iv,($ivp)		# write IV
	movdqu	$CDGH,16($ctx)
$code.=<<___ if ($win64);
	movaps	0*16(%rsp),%xmm6
	movaps	1*16(%rsp),%xmm7
	movaps	2*16(%rsp),%xmm8
	movaps	3*16(%rsp),%xmm9
	movaps	4*16(%rsp),%xmm10
	movaps	5*16(%rsp),%xmm11
	movaps	6*16(%rsp),%xmm12
	movaps	7*16(%rsp),%xmm13
	movaps	8*16(%rsp),%xmm14
	movaps	9*16(%rsp),%xmm15
	lea	8+10*16(%rsp),%rsp
___
.size	${func}_shaext,.-${func}_shaext
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64 && $avx) {
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
$code.=<<___ if ($shaext);
	lea	aesni_cbc_sha256_enc_shaext(%rip),%r10

	lea	512($context),%rdi	# &context.Xmm6
	.long	0xa548f3fc		# cld; rep movsq
	lea	168(%rax),%rax		# adjust stack pointer
$code.=<<___ if ($avx>1);
	lea	.Lavx2_shortcut(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<avx2_shortcut
	add	\$`2*$SZ*($rounds-8)`,%rax
	mov	%rax,%rsi		# put aside Rsp
	mov	16*$SZ+7*8(%rax),%rax	# pull $_rsp

	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

	lea	16*$SZ+8*8(%rsi),%rsi	# Xmm6- save area
	lea	512($context),%rdi	# &context.Xmm6
	.long	0xa548f3fc		# cld; rep movsq

	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi
	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)
	mov	\$1,%eax		# ExceptionContinueSearch
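	# (returning ExceptionContinueSearch: the handler only patches
	# the CONTEXT so the system unwinder can continue; it never
	# handles the exception itself)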
.size	se_handler,.-se_handler
	.rva	.LSEH_begin_${func}_xop
	.rva	.LSEH_end_${func}_xop
	.rva	.LSEH_info_${func}_xop

	.rva	.LSEH_begin_${func}_avx
	.rva	.LSEH_end_${func}_avx
	.rva	.LSEH_info_${func}_avx
$code.=<<___ if ($avx>1);
	.rva	.LSEH_begin_${func}_avx2
	.rva	.LSEH_end_${func}_avx2
	.rva	.LSEH_info_${func}_avx2
$code.=<<___ if ($shaext);
	.rva	.LSEH_begin_${func}_shaext
	.rva	.LSEH_end_${func}_shaext
	.rva	.LSEH_info_${func}_shaext
.LSEH_info_${func}_xop:
	.rva	.Lprologue_xop,.Lepilogue_xop		# HandlerData[]

.LSEH_info_${func}_avx:
	.rva	.Lprologue_avx,.Lepilogue_avx		# HandlerData[]
$code.=<<___ if ($avx>1);
.LSEH_info_${func}_avx2:
	.rva	.Lprologue_avx2,.Lepilogue_avx2		# HandlerData[]
$code.=<<___ if ($shaext);
.LSEH_info_${func}_shaext:
	.rva	.Lprologue_shaext,.Lepilogue_shaext	# HandlerData[]
####################################################################
sub rex {
  local *opcode=shift;
  my ($dst,$src)=@_;
  my $rex=0;

    $rex|=0x04			if($dst>=8);
    $rex|=0x01			if($src>=8);
    unshift @opcode,$rex|0x40	if($rex);
}
my %opcodelet = (
		"sha256rnds2" => 0xcb,
		"sha256msg1"  => 0xcc,
		"sha256msg2"  => 0xcd	);
sub sha256op38 {
  my $instr = shift;

  if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
    my @opcode=(0x0f,0x38);
	rex(\@opcode,$2,$1);
	push @opcode,$opcodelet{$instr};
	push @opcode,0xc0|($1&7)|(($2&7)<<3);	# ModR/M
	return ".byte\t".join(',',@opcode);
  } else {
	return $instr."\t".@_[0];
  }
}
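# e.g. sha256op38("sha256rnds2","%xmm0,%xmm2") returns
# ".byte\t0x0f,0x38,0xcb,0xd0" - no REX prefix needed for xmm0/xmm2,
# and ModR/M 0xd0 (mod=11, reg=010=xmm2, rm=000=xmm0) - a fallback for
# assemblers that predate the sha256* mnemonics.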
$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/gem;
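# The first substitution evaluates all `...` arithmetic embedded in the
# generated text; the second routes every sha256* mnemonic through
# sha256op38() so the output assembles even without SHA extension
# support in the assembler.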