#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# January 2013
#
# This is an AESNI-CBC+SHA256 stitch implementation. The idea, as spelled
# out in http://download.intel.com/design/intarch/papers/323686.pdf, is
# that since AESNI-CBC encryption exhibits *very* low instruction-level
# parallelism, interleaving it with another algorithm allows better
# utilization of processor resources and therefore better performance.
# SHA256 instruction sequences(*) are taken from sha512-x86_64.pl and
# the AESNI code is weaved into them. As SHA256 dominates execution time,
# stitch performance does not depend on AES key length. Below are
# performance numbers in cycles per processed byte, less is better,
# for standalone AESNI-CBC encrypt, standalone SHA256, and the stitched
# subroutine:
#
#		 AES-128/-192/-256+SHA256	this(**)	gain
# Sandy Bridge	    5.05/6.05/7.05+11.6		13.0	+28%/36%/43%
# Ivy Bridge	    5.05/6.05/7.05+10.3		11.6	+32%/41%/50%
# Haswell	    4.43/5.29/6.19+7.80		8.79	+39%/49%/59%
# Bulldozer	    5.77/6.89/8.00+13.7		13.7	+42%/50%/58%
#
# (*)	there are XOP, AVX1 and AVX2 code paths, which means Westmere
#	is left out of the loop: the gain was not estimated high enough
#	to justify the effort;
# (**)	these are EVP-free results; results obtained with 'speed
#	-evp aes-256-cbc-hmac-sha256' will vary by a percent or two;
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.19) + ($1>=2.22);
}

if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.09) + ($1>=2.10);
}

if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	   `ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$avx = ($1>=10) + ($1>=12);
}

if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
	$avx = ($2>=3.0) + ($2>3.0);
}

$shaext=$avx;	### set to zero if compiling for 1.0.1
$avx=1		if (!$shaext && $avx);
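# To summarize the probes above: $avx ends up 0 (no SIMD code is
# emitted at all), 1 (the XOP and AVX1 paths) or 2 (additionally the
# AVX2+BMI path), while $shaext gates the SHA extension path.  When
# $shaext is cleared for a 1.0.1 build, $avx is capped at 1.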
open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;

$func="aesni_cbc_sha256_enc";
$TABLE="K256";
$SZ=4;
@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
				"%r8d","%r9d","%r10d","%r11d");
($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%esi");
@Sigma0=( 2,13,22);
@Sigma1=( 6,11,25);
@sigma0=( 7,18, 3);
@sigma1=(17,19,10);
$rounds=64;
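# The @Sigma*/@sigma* arrays above are the SHA-256 rotation/shift
# counts from FIPS 180-4; the assembly below implements, per 32-bit
# word,
#
#	Sigma0(x) = ROTR(x, 2) ^ ROTR(x,13) ^ ROTR(x,22)
#	Sigma1(x) = ROTR(x, 6) ^ ROTR(x,11) ^ ROTR(x,25)
#	sigma0(x) = ROTR(x, 7) ^ ROTR(x,18) ^ (x >>  3)
#	sigma1(x) = ROTR(x,17) ^ ROTR(x,19) ^ (x >> 10)
#
# or, as a plain Perl sketch (reference only, not used by the
# generator):
#
#	sub ROTR   { my ($x,$n)=@_; (($x>>$n)|($x<<(32-$n))) & 0xffffffff }
#	sub Sigma0 { ROTR($_[0], 2)^ROTR($_[0],13)^ROTR($_[0],22) }
#	sub Sigma1 { ROTR($_[0], 6)^ROTR($_[0],11)^ROTR($_[0],25) }
#	sub sigma0 { ROTR($_[0], 7)^ROTR($_[0],18)^($_[0]>> 3) }
#	sub sigma1 { ROTR($_[0],17)^ROTR($_[0],19)^($_[0]>>10) }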
########################################################################
# void aesni_cbc_sha256_enc(const void *inp,
#			void *out,
#			size_t length,
#			const AES_KEY *key,
#			unsigned char *iv,
#			SHA256_CTX *ctx,
#			const void *in0);
($inp, $out, $len, $key, $ivp, $ctx, $in0) =
("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
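# Note: the length argument is measured in 64-byte blocks, not bytes
# (it is shifted left by 6 in the SIMD prologues, and the SHAEXT path
# decrements it once per block).  $inp/$out are the AES-CBC source and
# destination, while $in0 points at the data being hashed; the two
# streams advance in lockstep, 64 bytes per iteration.  Calling the
# routine with a NULL first argument is used as a capability probe: it
# branches straight to .Lprobe with %eax set to 1 when the SIMD paths
# are compiled in, and to 0 otherwise.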
$Tbl="%rbp";

$_inp="16*$SZ+0*8(%rsp)";
$_out="16*$SZ+1*8(%rsp)";
$_end="16*$SZ+2*8(%rsp)";
$_key="16*$SZ+3*8(%rsp)";
$_ivp="16*$SZ+4*8(%rsp)";
$_ctx="16*$SZ+5*8(%rsp)";
$_in0="16*$SZ+6*8(%rsp)";
$_rsp="16*$SZ+7*8(%rsp)";
$framesz=16*$SZ+8*8;
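# Resulting stack frame: 16*$SZ bytes at 0(%rsp) hold the sixteen
# prepared message-schedule words (X[i]+K[i]), followed by eight
# quadwords caching the arguments listed above plus the original %rsp,
# $framesz bytes in total; on win64 the xmm6-xmm15 save area is placed
# immediately after that.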
107 $code=<<___;
108 .text
110 .extern OPENSSL_ia32cap_P
111 .globl $func
112 .type $func,\@abi-omnipotent
113 .align 16
114 $func:
116 if ($avx) {
117 $code.=<<___;
118 lea OPENSSL_ia32cap_P(%rip),%r11
119 mov \$1,%eax
120 cmp \$0,`$win64?"%rcx":"%rdi"`
121 je .Lprobe
122 mov 0(%r11),%eax
123 mov 4(%r11),%r10
125 $code.=<<___ if ($shaext);
126 bt \$61,%r10 # check for SHA
127 jc ${func}_shaext
129 $code.=<<___;
130 mov %r10,%r11
131 shr \$32,%r11
133 test \$`1<<11`,%r10d # check for XOP
134 jnz ${func}_xop
136 $code.=<<___ if ($avx>1);
137 and \$`1<<8|1<<5|1<<3`,%r11d # check for BMI2+AVX2+BMI1
138 cmp \$`1<<8|1<<5|1<<3`,%r11d
139 je ${func}_avx2
141 $code.=<<___;
142 and \$`1<<28`,%r10d # check for AVX
143 jnz ${func}_avx
147 $code.=<<___;
148 xor %eax,%eax
149 cmp \$0,`$win64?"%rcx":"%rdi"`
150 je .Lprobe
152 .Lprobe:
154 .size $func,.-$func
156 .align 64
157 .type $TABLE,\@object
158 $TABLE:
159 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
160 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
161 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
162 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
163 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
164 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
165 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
166 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
167 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
168 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
169 .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
170 .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
171 .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
172 .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
173 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
174 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
175 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
176 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
177 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
178 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
179 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
180 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
181 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
182 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
183 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
184 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
185 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
186 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
187 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
188 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
189 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
190 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
192 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
193 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
194 .long 0,0,0,0, 0,0,0,0, -1,-1,-1,-1
195 .long 0,0,0,0, 0,0,0,0
196 .asciz "AESNI-CBC+SHA256 stitch for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
197 .align 64
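# The 64 K[i] round constants above are laid out with each group of
# four repeated twice, so the AVX2 path can load them as 256-bit values
# with identical constants in both 128-bit lanes while the 128-bit
# paths simply read the first copy.  The table is followed by the
# byte-swap mask used with vpshufb and by zero/all-ones rows from which
# $mask10/$mask12/$mask14 are loaded (indexed by the AES round count),
# so that only the vaesenclast result matching the actual key length
# survives the vpand/vpor selection below.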
200 ######################################################################
201 # SIMD code paths
204 ($iv,$inout,$roundkey,$temp,
205 $mask10,$mask12,$mask14,$offload)=map("%xmm$_",(8..15));
207 $aesni_cbc_idx=0;
208 @aesni_cbc_block = (
209 ## &vmovdqu ($roundkey,"0x00-0x80($inp)");'
210 ## &vmovdqu ($inout,($inp));
211 ## &mov ($_inp,$inp);
213 '&vpxor ($inout,$inout,$roundkey);'.
214 ' &vmovdqu ($roundkey,"0x10-0x80($inp)");',
216 '&vpxor ($inout,$inout,$iv);',
218 '&vaesenc ($inout,$inout,$roundkey);'.
219 ' &vmovdqu ($roundkey,"0x20-0x80($inp)");',
221 '&vaesenc ($inout,$inout,$roundkey);'.
222 ' &vmovdqu ($roundkey,"0x30-0x80($inp)");',
224 '&vaesenc ($inout,$inout,$roundkey);'.
225 ' &vmovdqu ($roundkey,"0x40-0x80($inp)");',
227 '&vaesenc ($inout,$inout,$roundkey);'.
228 ' &vmovdqu ($roundkey,"0x50-0x80($inp)");',
230 '&vaesenc ($inout,$inout,$roundkey);'.
231 ' &vmovdqu ($roundkey,"0x60-0x80($inp)");',
233 '&vaesenc ($inout,$inout,$roundkey);'.
234 ' &vmovdqu ($roundkey,"0x70-0x80($inp)");',
236 '&vaesenc ($inout,$inout,$roundkey);'.
237 ' &vmovdqu ($roundkey,"0x80-0x80($inp)");',
239 '&vaesenc ($inout,$inout,$roundkey);'.
240 ' &vmovdqu ($roundkey,"0x90-0x80($inp)");',
242 '&vaesenc ($inout,$inout,$roundkey);'.
243 ' &vmovdqu ($roundkey,"0xa0-0x80($inp)");',
245 '&vaesenclast ($temp,$inout,$roundkey);'.
246 ' &vaesenc ($inout,$inout,$roundkey);'.
247 ' &vmovdqu ($roundkey,"0xb0-0x80($inp)");',
249 '&vpand ($iv,$temp,$mask10);'.
250 ' &vaesenc ($inout,$inout,$roundkey);'.
251 ' &vmovdqu ($roundkey,"0xc0-0x80($inp)");',
253 '&vaesenclast ($temp,$inout,$roundkey);'.
254 ' &vaesenc ($inout,$inout,$roundkey);'.
255 ' &vmovdqu ($roundkey,"0xd0-0x80($inp)");',
257 '&vpand ($temp,$temp,$mask12);'.
258 ' &vaesenc ($inout,$inout,$roundkey);'.
259 '&vmovdqu ($roundkey,"0xe0-0x80($inp)");',
261 '&vpor ($iv,$iv,$temp);'.
262 ' &vaesenclast ($temp,$inout,$roundkey);'.
263 ' &vmovdqu ($roundkey,"0x00-0x80($inp)");'
265 ## &mov ($inp,$_inp);
266 ## &mov ($out,$_out);
267 ## &vpand ($temp,$temp,$mask14);
268 ## &vpor ($iv,$iv,$temp);
269 ## &vmovdqu ($iv,($out,$inp);
270 ## &lea (inp,16($inp));
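# Each element of @aesni_cbc_block is one fragment of the AES-CBC
# encryption of a single 16-byte block; body_00_15() injects one
# fragment per SHA-256 round via $aesni_cbc_idx, so exactly one cipher
# block is pushed through per group of 16 rounds (four blocks per
# 64-byte SHA block).  Round keys are streamed through $inp, which the
# prologues re-point at $key+0x80, and vaesenclast is issued
# speculatively after 10, 12 and 14 rounds; the masks loaded from the
# K256 trailer keep only the result for the actual key length, which
# the callers vpand/vpor together before storing the ciphertext.  The
# commented-out lines show the prologue and epilogue that live in the
# callers.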
273 my $a4=$T1;
274 my ($a,$b,$c,$d,$e,$f,$g,$h);
276 sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm
277 { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
278 my $arg = pop;
279 $arg = "\$$arg" if ($arg*1 eq $arg);
280 $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
283 sub body_00_15 () {
285 '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
287 '&ror ($a0,$Sigma1[2]-$Sigma1[1])',
288 '&mov ($a,$a1)',
289 '&mov ($a4,$f)',
291 '&xor ($a0,$e)',
292 '&ror ($a1,$Sigma0[2]-$Sigma0[1])',
293 '&xor ($a4,$g)', # f^g
295 '&ror ($a0,$Sigma1[1]-$Sigma1[0])',
296 '&xor ($a1,$a)',
297 '&and ($a4,$e)', # (f^g)&e
299 @aesni_cbc_block[$aesni_cbc_idx++].
300 '&xor ($a0,$e)',
301 '&add ($h,$SZ*($i&15)."(%rsp)")', # h+=X[i]+K[i]
302 '&mov ($a2,$a)',
304 '&ror ($a1,$Sigma0[1]-$Sigma0[0])',
305 '&xor ($a4,$g)', # Ch(e,f,g)=((f^g)&e)^g
306 '&xor ($a2,$b)', # a^b, b^c in next round
308 '&ror ($a0,$Sigma1[0])', # Sigma1(e)
309 '&add ($h,$a4)', # h+=Ch(e,f,g)
310 '&and ($a3,$a2)', # (b^c)&(a^b)
312 '&xor ($a1,$a)',
313 '&add ($h,$a0)', # h+=Sigma1(e)
314 '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b)
316 '&add ($d,$h)', # d+=h
317 '&ror ($a1,$Sigma0[0])', # Sigma0(a)
318 '&add ($h,$a3)', # h+=Maj(a,b,c)
320 '&mov ($a0,$d)',
321 '&add ($a1,$h);'. # h+=Sigma0(a)
322 '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
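# A note on the Maj() arithmetic above, inherited from sha512-x86_64.pl:
# with $a2 = a^b computed in the current round and $a3 = b^c carried
# over from the previous one,
#
#	Maj(a,b,c) = (a&b)^(a&c)^(b&c) = ((a^b)&(b^c))^b
#
# so a single and+xor per round suffices, and this round's a^b is
# recycled as the next round's b^c when the working variables rotate.
# The new value of `a' (h+Sigma0(a)) is likewise left in $a1 and only
# picked up at the start of the following round, which is why the loop
# preambles seed $a1 with $A and $a3 with $B^$C ("magic").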
326 if ($avx) {{
327 ######################################################################
328 # XOP code path
330 $code.=<<___;
331 .type ${func}_xop,\@function,6
332 .align 64
333 ${func}_xop:
334 .Lxop_shortcut:
335 mov `($win64?56:8)`(%rsp),$in0 # load 7th parameter
336 push %rbx
337 push %rbp
338 push %r12
339 push %r13
340 push %r14
341 push %r15
342 mov %rsp,%r11 # copy %rsp
343 sub \$`$framesz+$win64*16*10`,%rsp
344 and \$-64,%rsp # align stack frame
346 shl \$6,$len
347 sub $inp,$out # re-bias
348 sub $inp,$in0
349 add $inp,$len # end of input
351 #mov $inp,$_inp # saved later
352 mov $out,$_out
353 mov $len,$_end
354 #mov $key,$_key # remains resident in $inp register
355 mov $ivp,$_ivp
356 mov $ctx,$_ctx
357 mov $in0,$_in0
358 mov %r11,$_rsp
360 $code.=<<___ if ($win64);
361 movaps %xmm6,`$framesz+16*0`(%rsp)
362 movaps %xmm7,`$framesz+16*1`(%rsp)
363 movaps %xmm8,`$framesz+16*2`(%rsp)
364 movaps %xmm9,`$framesz+16*3`(%rsp)
365 movaps %xmm10,`$framesz+16*4`(%rsp)
366 movaps %xmm11,`$framesz+16*5`(%rsp)
367 movaps %xmm12,`$framesz+16*6`(%rsp)
368 movaps %xmm13,`$framesz+16*7`(%rsp)
369 movaps %xmm14,`$framesz+16*8`(%rsp)
370 movaps %xmm15,`$framesz+16*9`(%rsp)
372 $code.=<<___;
373 .Lprologue_xop:
374 vzeroall
376 mov $inp,%r12 # borrow $a4
377 lea 0x80($key),$inp # size optimization, reassign
378 lea $TABLE+`$SZ*2*$rounds+32`(%rip),%r13 # borrow $a0
379 mov 0xf0-0x80($inp),%r14d # rounds, borrow $a1
380 mov $ctx,%r15 # borrow $a2
381 mov $in0,%rsi # borrow $a3
382 vmovdqu ($ivp),$iv # load IV
383 sub \$9,%r14
385 mov $SZ*0(%r15),$A
386 mov $SZ*1(%r15),$B
387 mov $SZ*2(%r15),$C
388 mov $SZ*3(%r15),$D
389 mov $SZ*4(%r15),$E
390 mov $SZ*5(%r15),$F
391 mov $SZ*6(%r15),$G
392 mov $SZ*7(%r15),$H
394 vmovdqa 0x00(%r13,%r14,8),$mask14
395 vmovdqa 0x10(%r13,%r14,8),$mask12
396 vmovdqa 0x20(%r13,%r14,8),$mask10
397 vmovdqu 0x00-0x80($inp),$roundkey
398 jmp .Lloop_xop
400 if ($SZ==4) { # SHA256
401 my @X = map("%xmm$_",(0..3));
402 my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
404 $code.=<<___;
405 .align 16
406 .Lloop_xop:
407 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
408 vmovdqu 0x00(%rsi,%r12),@X[0]
409 vmovdqu 0x10(%rsi,%r12),@X[1]
410 vmovdqu 0x20(%rsi,%r12),@X[2]
411 vmovdqu 0x30(%rsi,%r12),@X[3]
412 vpshufb $t3,@X[0],@X[0]
413 lea $TABLE(%rip),$Tbl
414 vpshufb $t3,@X[1],@X[1]
415 vpshufb $t3,@X[2],@X[2]
416 vpaddd 0x00($Tbl),@X[0],$t0
417 vpshufb $t3,@X[3],@X[3]
418 vpaddd 0x20($Tbl),@X[1],$t1
419 vpaddd 0x40($Tbl),@X[2],$t2
420 vpaddd 0x60($Tbl),@X[3],$t3
421 vmovdqa $t0,0x00(%rsp)
422 mov $A,$a1
423 vmovdqa $t1,0x10(%rsp)
424 mov $B,$a3
425 vmovdqa $t2,0x20(%rsp)
426 xor $C,$a3 # magic
427 vmovdqa $t3,0x30(%rsp)
428 mov $E,$a0
429 jmp .Lxop_00_47
431 .align 16
432 .Lxop_00_47:
433 sub \$-16*2*$SZ,$Tbl # size optimization
434 vmovdqu (%r12),$inout # $a4
435 mov %r12,$_inp # $a4
437 sub XOP_256_00_47 () {
438 my $j = shift;
439 my $body = shift;
440 my @X = @_;
441 my @insns = (&$body,&$body,&$body,&$body); # 104 instructions
443 &vpalignr ($t0,@X[1],@X[0],$SZ); # X[1..4]
444 eval(shift(@insns));
445 eval(shift(@insns));
446 &vpalignr ($t3,@X[3],@X[2],$SZ); # X[9..12]
447 eval(shift(@insns));
448 eval(shift(@insns));
449 &vprotd ($t1,$t0,8*$SZ-$sigma0[1]);
450 eval(shift(@insns));
451 eval(shift(@insns));
452 &vpsrld ($t0,$t0,$sigma0[2]);
453 eval(shift(@insns));
454 eval(shift(@insns));
455 &vpaddd (@X[0],@X[0],$t3); # X[0..3] += X[9..12]
456 eval(shift(@insns));
457 eval(shift(@insns));
458 eval(shift(@insns));
459 eval(shift(@insns));
460 &vprotd ($t2,$t1,$sigma0[1]-$sigma0[0]);
461 eval(shift(@insns));
462 eval(shift(@insns));
463 &vpxor ($t0,$t0,$t1);
464 eval(shift(@insns));
465 eval(shift(@insns));
466 eval(shift(@insns));
467 eval(shift(@insns));
468 &vprotd ($t3,@X[3],8*$SZ-$sigma1[1]);
469 eval(shift(@insns));
470 eval(shift(@insns));
471 &vpxor ($t0,$t0,$t2); # sigma0(X[1..4])
472 eval(shift(@insns));
473 eval(shift(@insns));
474 &vpsrld ($t2,@X[3],$sigma1[2]);
475 eval(shift(@insns));
476 eval(shift(@insns));
477 &vpaddd (@X[0],@X[0],$t0); # X[0..3] += sigma0(X[1..4])
478 eval(shift(@insns));
479 eval(shift(@insns));
480 &vprotd ($t1,$t3,$sigma1[1]-$sigma1[0]);
481 eval(shift(@insns));
482 eval(shift(@insns));
483 &vpxor ($t3,$t3,$t2);
484 eval(shift(@insns));
485 eval(shift(@insns));
486 eval(shift(@insns));
487 eval(shift(@insns));
488 &vpxor ($t3,$t3,$t1); # sigma1(X[14..15])
489 eval(shift(@insns));
490 eval(shift(@insns));
491 eval(shift(@insns));
492 eval(shift(@insns));
493 &vpsrldq ($t3,$t3,8);
494 eval(shift(@insns));
495 eval(shift(@insns));
496 eval(shift(@insns));
497 eval(shift(@insns));
498 &vpaddd (@X[0],@X[0],$t3); # X[0..1] += sigma1(X[14..15])
499 eval(shift(@insns));
500 eval(shift(@insns));
501 eval(shift(@insns));
502 eval(shift(@insns));
503 &vprotd ($t3,@X[0],8*$SZ-$sigma1[1]);
504 eval(shift(@insns));
505 eval(shift(@insns));
506 &vpsrld ($t2,@X[0],$sigma1[2]);
507 eval(shift(@insns));
508 eval(shift(@insns));
509 &vprotd ($t1,$t3,$sigma1[1]-$sigma1[0]);
510 eval(shift(@insns));
511 eval(shift(@insns));
512 &vpxor ($t3,$t3,$t2);
513 eval(shift(@insns));
514 eval(shift(@insns));
515 eval(shift(@insns));
516 eval(shift(@insns));
517 &vpxor ($t3,$t3,$t1); # sigma1(X[16..17])
518 eval(shift(@insns));
519 eval(shift(@insns));
520 eval(shift(@insns));
521 eval(shift(@insns));
522 &vpslldq ($t3,$t3,8); # 22 instructions
523 eval(shift(@insns));
524 eval(shift(@insns));
525 eval(shift(@insns));
526 eval(shift(@insns));
527 &vpaddd (@X[0],@X[0],$t3); # X[2..3] += sigma1(X[16..17])
528 eval(shift(@insns));
529 eval(shift(@insns));
530 eval(shift(@insns));
531 eval(shift(@insns));
532 &vpaddd ($t2,@X[0],16*2*$j."($Tbl)");
533 foreach (@insns) { eval; } # remaining instructions
534 &vmovdqa (16*$j."(%rsp)",$t2);
537 $aesni_cbc_idx=0;
538 for ($i=0,$j=0; $j<4; $j++) {
539 &XOP_256_00_47($j,\&body_00_15,@X);
540 push(@X,shift(@X)); # rotate(@X)
542 &mov ("%r12",$_inp); # borrow $a4
543 &vpand ($temp,$temp,$mask14);
544 &mov ("%r15",$_out); # borrow $a2
545 &vpor ($iv,$iv,$temp);
546 &vmovdqu ("(%r15,%r12)",$iv); # write output
547 &lea ("%r12","16(%r12)"); # inp++
549 &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0);
550 &jne (".Lxop_00_47");
552 &vmovdqu ($inout,"(%r12)");
553 &mov ($_inp,"%r12");
555 $aesni_cbc_idx=0;
556 for ($i=0; $i<16; ) {
557 foreach(body_00_15()) { eval; }
560 $code.=<<___;
561 mov $_inp,%r12 # borrow $a4
562 mov $_out,%r13 # borrow $a0
563 mov $_ctx,%r15 # borrow $a2
564 mov $_in0,%rsi # borrow $a3
566 vpand $mask14,$temp,$temp
567 mov $a1,$A
568 vpor $temp,$iv,$iv
569 vmovdqu $iv,(%r13,%r12) # write output
570 lea 16(%r12),%r12 # inp++
572 add $SZ*0(%r15),$A
573 add $SZ*1(%r15),$B
574 add $SZ*2(%r15),$C
575 add $SZ*3(%r15),$D
576 add $SZ*4(%r15),$E
577 add $SZ*5(%r15),$F
578 add $SZ*6(%r15),$G
579 add $SZ*7(%r15),$H
581 cmp $_end,%r12
583 mov $A,$SZ*0(%r15)
584 mov $B,$SZ*1(%r15)
585 mov $C,$SZ*2(%r15)
586 mov $D,$SZ*3(%r15)
587 mov $E,$SZ*4(%r15)
588 mov $F,$SZ*5(%r15)
589 mov $G,$SZ*6(%r15)
590 mov $H,$SZ*7(%r15)
592 jb .Lloop_xop
594 mov $_ivp,$ivp
595 mov $_rsp,%rsi
596 vmovdqu $iv,($ivp) # output IV
597 vzeroall
599 $code.=<<___ if ($win64);
600 movaps `$framesz+16*0`(%rsp),%xmm6
601 movaps `$framesz+16*1`(%rsp),%xmm7
602 movaps `$framesz+16*2`(%rsp),%xmm8
603 movaps `$framesz+16*3`(%rsp),%xmm9
604 movaps `$framesz+16*4`(%rsp),%xmm10
605 movaps `$framesz+16*5`(%rsp),%xmm11
606 movaps `$framesz+16*6`(%rsp),%xmm12
607 movaps `$framesz+16*7`(%rsp),%xmm13
608 movaps `$framesz+16*8`(%rsp),%xmm14
609 movaps `$framesz+16*9`(%rsp),%xmm15
611 $code.=<<___;
612 mov (%rsi),%r15
613 mov 8(%rsi),%r14
614 mov 16(%rsi),%r13
615 mov 24(%rsi),%r12
616 mov 32(%rsi),%rbp
617 mov 40(%rsi),%rbx
618 lea 48(%rsi),%rsp
619 .Lepilogue_xop:
621 .size ${func}_xop,.-${func}_xop
623 ######################################################################
624 # AVX+shrd code path
626 local *ror = sub { &shrd(@_[0],@_) };
628 $code.=<<___;
629 .type ${func}_avx,\@function,6
630 .align 64
631 ${func}_avx:
632 .Lavx_shortcut:
633 mov `($win64?56:8)`(%rsp),$in0 # load 7th parameter
634 push %rbx
635 push %rbp
636 push %r12
637 push %r13
638 push %r14
639 push %r15
640 mov %rsp,%r11 # copy %rsp
641 sub \$`$framesz+$win64*16*10`,%rsp
642 and \$-64,%rsp # align stack frame
644 shl \$6,$len
645 sub $inp,$out # re-bias
646 sub $inp,$in0
647 add $inp,$len # end of input
649 #mov $inp,$_inp # saved later
650 mov $out,$_out
651 mov $len,$_end
652 #mov $key,$_key # remains resident in $inp register
653 mov $ivp,$_ivp
654 mov $ctx,$_ctx
655 mov $in0,$_in0
656 mov %r11,$_rsp
658 $code.=<<___ if ($win64);
659 movaps %xmm6,`$framesz+16*0`(%rsp)
660 movaps %xmm7,`$framesz+16*1`(%rsp)
661 movaps %xmm8,`$framesz+16*2`(%rsp)
662 movaps %xmm9,`$framesz+16*3`(%rsp)
663 movaps %xmm10,`$framesz+16*4`(%rsp)
664 movaps %xmm11,`$framesz+16*5`(%rsp)
665 movaps %xmm12,`$framesz+16*6`(%rsp)
666 movaps %xmm13,`$framesz+16*7`(%rsp)
667 movaps %xmm14,`$framesz+16*8`(%rsp)
668 movaps %xmm15,`$framesz+16*9`(%rsp)
670 $code.=<<___;
671 .Lprologue_avx:
672 vzeroall
674 mov $inp,%r12 # borrow $a4
675 lea 0x80($key),$inp # size optimization, reassign
676 lea $TABLE+`$SZ*2*$rounds+32`(%rip),%r13 # borrow $a0
677 mov 0xf0-0x80($inp),%r14d # rounds, borrow $a1
678 mov $ctx,%r15 # borrow $a2
679 mov $in0,%rsi # borrow $a3
680 vmovdqu ($ivp),$iv # load IV
681 sub \$9,%r14
683 mov $SZ*0(%r15),$A
684 mov $SZ*1(%r15),$B
685 mov $SZ*2(%r15),$C
686 mov $SZ*3(%r15),$D
687 mov $SZ*4(%r15),$E
688 mov $SZ*5(%r15),$F
689 mov $SZ*6(%r15),$G
690 mov $SZ*7(%r15),$H
692 vmovdqa 0x00(%r13,%r14,8),$mask14
693 vmovdqa 0x10(%r13,%r14,8),$mask12
694 vmovdqa 0x20(%r13,%r14,8),$mask10
695 vmovdqu 0x00-0x80($inp),$roundkey
697 if ($SZ==4) { # SHA256
698 my @X = map("%xmm$_",(0..3));
699 my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
701 $code.=<<___;
702 jmp .Lloop_avx
703 .align 16
704 .Lloop_avx:
705 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
706 vmovdqu 0x00(%rsi,%r12),@X[0]
707 vmovdqu 0x10(%rsi,%r12),@X[1]
708 vmovdqu 0x20(%rsi,%r12),@X[2]
709 vmovdqu 0x30(%rsi,%r12),@X[3]
710 vpshufb $t3,@X[0],@X[0]
711 lea $TABLE(%rip),$Tbl
712 vpshufb $t3,@X[1],@X[1]
713 vpshufb $t3,@X[2],@X[2]
714 vpaddd 0x00($Tbl),@X[0],$t0
715 vpshufb $t3,@X[3],@X[3]
716 vpaddd 0x20($Tbl),@X[1],$t1
717 vpaddd 0x40($Tbl),@X[2],$t2
718 vpaddd 0x60($Tbl),@X[3],$t3
719 vmovdqa $t0,0x00(%rsp)
720 mov $A,$a1
721 vmovdqa $t1,0x10(%rsp)
722 mov $B,$a3
723 vmovdqa $t2,0x20(%rsp)
724 xor $C,$a3 # magic
725 vmovdqa $t3,0x30(%rsp)
726 mov $E,$a0
727 jmp .Lavx_00_47
729 .align 16
730 .Lavx_00_47:
731 sub \$-16*2*$SZ,$Tbl # size optimization
732 vmovdqu (%r12),$inout # $a4
733 mov %r12,$_inp # $a4
735 sub Xupdate_256_AVX () {
737 '&vpalignr ($t0,@X[1],@X[0],$SZ)', # X[1..4]
738 '&vpalignr ($t3,@X[3],@X[2],$SZ)', # X[9..12]
739 '&vpsrld ($t2,$t0,$sigma0[0]);',
740 '&vpaddd (@X[0],@X[0],$t3)', # X[0..3] += X[9..12]
741 '&vpsrld ($t3,$t0,$sigma0[2])',
742 '&vpslld ($t1,$t0,8*$SZ-$sigma0[1]);',
743 '&vpxor ($t0,$t3,$t2)',
744 '&vpshufd ($t3,@X[3],0b11111010)',# X[14..15]
745 '&vpsrld ($t2,$t2,$sigma0[1]-$sigma0[0]);',
746 '&vpxor ($t0,$t0,$t1)',
747 '&vpslld ($t1,$t1,$sigma0[1]-$sigma0[0]);',
748 '&vpxor ($t0,$t0,$t2)',
749 '&vpsrld ($t2,$t3,$sigma1[2]);',
750 '&vpxor ($t0,$t0,$t1)', # sigma0(X[1..4])
751 '&vpsrlq ($t3,$t3,$sigma1[0]);',
752 '&vpaddd (@X[0],@X[0],$t0)', # X[0..3] += sigma0(X[1..4])
753 '&vpxor ($t2,$t2,$t3);',
754 '&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])',
755 '&vpxor ($t2,$t2,$t3)', # sigma1(X[14..15])
756 '&vpshufd ($t2,$t2,0b10000100)',
757 '&vpsrldq ($t2,$t2,8)',
758 '&vpaddd (@X[0],@X[0],$t2)', # X[0..1] += sigma1(X[14..15])
759 '&vpshufd ($t3,@X[0],0b01010000)',# X[16..17]
760 '&vpsrld ($t2,$t3,$sigma1[2])',
761 '&vpsrlq ($t3,$t3,$sigma1[0])',
762 '&vpxor ($t2,$t2,$t3);',
763 '&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])',
764 '&vpxor ($t2,$t2,$t3)',
765 '&vpshufd ($t2,$t2,0b11101000)',
766 '&vpslldq ($t2,$t2,8)',
767 '&vpaddd (@X[0],@X[0],$t2)' # X[2..3] += sigma1(X[16..17])
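# Xupdate_256_AVX() computes four new schedule words at a time,
# following the FIPS 180-4 recurrence
#
#	W[t] = sigma1(W[t-2]) + W[t-7] + sigma0(W[t-15]) + W[t-16]
#
# sigma0() is applied to X[1..4] (the first vpalignr), W[t-7] comes
# from the second vpalignr (X[9..12]), and sigma1() is done in two
# halves because only two of the four W[t-2] inputs exist when the
# first pair of new words is formed.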
771 sub AVX_256_00_47 () {
772 my $j = shift;
773 my $body = shift;
774 my @X = @_;
775 my @insns = (&$body,&$body,&$body,&$body); # 104 instructions
777 foreach (Xupdate_256_AVX()) { # 29 instructions
778 eval;
779 eval(shift(@insns));
780 eval(shift(@insns));
781 eval(shift(@insns));
783 &vpaddd ($t2,@X[0],16*2*$j."($Tbl)");
784 foreach (@insns) { eval; } # remaining instructions
785 &vmovdqa (16*$j."(%rsp)",$t2);
788 $aesni_cbc_idx=0;
789 for ($i=0,$j=0; $j<4; $j++) {
790 &AVX_256_00_47($j,\&body_00_15,@X);
791 push(@X,shift(@X)); # rotate(@X)
793 &mov ("%r12",$_inp); # borrow $a4
794 &vpand ($temp,$temp,$mask14);
795 &mov ("%r15",$_out); # borrow $a2
796 &vpor ($iv,$iv,$temp);
797 &vmovdqu ("(%r15,%r12)",$iv); # write output
798 &lea ("%r12","16(%r12)"); # inp++
800 &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0);
801 &jne (".Lavx_00_47");
803 &vmovdqu ($inout,"(%r12)");
804 &mov ($_inp,"%r12");
806 $aesni_cbc_idx=0;
807 for ($i=0; $i<16; ) {
808 foreach(body_00_15()) { eval; }
812 $code.=<<___;
813 mov $_inp,%r12 # borrow $a4
814 mov $_out,%r13 # borrow $a0
815 mov $_ctx,%r15 # borrow $a2
816 mov $_in0,%rsi # borrow $a3
818 vpand $mask14,$temp,$temp
819 mov $a1,$A
820 vpor $temp,$iv,$iv
821 vmovdqu $iv,(%r13,%r12) # write output
822 lea 16(%r12),%r12 # inp++
824 add $SZ*0(%r15),$A
825 add $SZ*1(%r15),$B
826 add $SZ*2(%r15),$C
827 add $SZ*3(%r15),$D
828 add $SZ*4(%r15),$E
829 add $SZ*5(%r15),$F
830 add $SZ*6(%r15),$G
831 add $SZ*7(%r15),$H
833 cmp $_end,%r12
835 mov $A,$SZ*0(%r15)
836 mov $B,$SZ*1(%r15)
837 mov $C,$SZ*2(%r15)
838 mov $D,$SZ*3(%r15)
839 mov $E,$SZ*4(%r15)
840 mov $F,$SZ*5(%r15)
841 mov $G,$SZ*6(%r15)
842 mov $H,$SZ*7(%r15)
843 jb .Lloop_avx
845 mov $_ivp,$ivp
846 mov $_rsp,%rsi
847 vmovdqu $iv,($ivp) # output IV
848 vzeroall
850 $code.=<<___ if ($win64);
851 movaps `$framesz+16*0`(%rsp),%xmm6
852 movaps `$framesz+16*1`(%rsp),%xmm7
853 movaps `$framesz+16*2`(%rsp),%xmm8
854 movaps `$framesz+16*3`(%rsp),%xmm9
855 movaps `$framesz+16*4`(%rsp),%xmm10
856 movaps `$framesz+16*5`(%rsp),%xmm11
857 movaps `$framesz+16*6`(%rsp),%xmm12
858 movaps `$framesz+16*7`(%rsp),%xmm13
859 movaps `$framesz+16*8`(%rsp),%xmm14
860 movaps `$framesz+16*9`(%rsp),%xmm15
862 $code.=<<___;
863 mov (%rsi),%r15
864 mov 8(%rsi),%r14
865 mov 16(%rsi),%r13
866 mov 24(%rsi),%r12
867 mov 32(%rsi),%rbp
868 mov 40(%rsi),%rbx
869 lea 48(%rsi),%rsp
870 .Lepilogue_avx:
872 .size ${func}_avx,.-${func}_avx
875 if ($avx>1) {{
876 ######################################################################
877 # AVX2+BMI code path
879 my $a5=$SZ==4?"%esi":"%rsi"; # zap $inp
880 my $PUSH8=8*2*$SZ;
881 use integer;
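# The AVX2 path keeps the same round structure but runs the message
# schedule on 256-bit registers, i.e. it schedules two 64-byte blocks
# at once: the low 128-bit lanes are loaded from the current block and
# the high lanes (vinserti128 from %r12) from the following block, or
# from the stack when the end of input is reached ("next block or
# random data").  The input and output pointers are parked in the
# $offload xmm register (vpinsrq/vpextrq) to relieve GPR pressure.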
883 sub bodyx_00_15 () {
884 # at start $a1 should be zero, $a3 - $b^$c and $a4 copy of $f
886 '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
888 '&add ($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)', # h+=X[i]+K[i]
889 '&and ($a4,$e)', # f&e
890 '&rorx ($a0,$e,$Sigma1[2])',
891 '&rorx ($a2,$e,$Sigma1[1])',
893 '&lea ($a,"($a,$a1)")', # h+=Sigma0(a) from the past
894 '&lea ($h,"($h,$a4)")',
895 '&andn ($a4,$e,$g)', # ~e&g
896 '&xor ($a0,$a2)',
898 '&rorx ($a1,$e,$Sigma1[0])',
899 '&lea ($h,"($h,$a4)")', # h+=Ch(e,f,g)=(e&f)+(~e&g)
900 '&xor ($a0,$a1)', # Sigma1(e)
901 '&mov ($a2,$a)',
903 '&rorx ($a4,$a,$Sigma0[2])',
904 '&lea ($h,"($h,$a0)")', # h+=Sigma1(e)
905 '&xor ($a2,$b)', # a^b, b^c in next round
906 '&rorx ($a1,$a,$Sigma0[1])',
908 '&rorx ($a0,$a,$Sigma0[0])',
909 '&lea ($d,"($d,$h)")', # d+=h
910 '&and ($a3,$a2)', # (b^c)&(a^b)
911 @aesni_cbc_block[$aesni_cbc_idx++].
912 '&xor ($a1,$a4)',
914 '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b)
915 '&xor ($a1,$a0)', # Sigma0(a)
916 '&lea ($h,"($h,$a3)");'. # h+=Maj(a,b,c)
917 '&mov ($a4,$e)', # copy of f in future
919 '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
921 # and at the finish one has to $a+=$a1
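# bodyx_00_15() relies on BMI1/BMI2: rorx produces the Sigma rotations
# without touching the flags, andn yields ~e&g directly, and since
# (e&f) and (~e&g) can never have a set bit in common, Ch(e,f,g) is
# accumulated with plain lea additions.  As noted above, $a1 carries
# Sigma0(a) from the previous round and is folded into $a at the end of
# each 16-round group.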
924 $code.=<<___;
925 .type ${func}_avx2,\@function,6
926 .align 64
927 ${func}_avx2:
928 .Lavx2_shortcut:
929 mov `($win64?56:8)`(%rsp),$in0 # load 7th parameter
930 push %rbx
931 push %rbp
932 push %r12
933 push %r13
934 push %r14
935 push %r15
936 mov %rsp,%r11 # copy %rsp
937 sub \$`2*$SZ*$rounds+8*8+$win64*16*10`,%rsp
938 and \$-256*$SZ,%rsp # align stack frame
939 add \$`2*$SZ*($rounds-8)`,%rsp
941 shl \$6,$len
942 sub $inp,$out # re-bias
943 sub $inp,$in0
944 add $inp,$len # end of input
946 #mov $inp,$_inp # saved later
947 #mov $out,$_out # kept in $offload
948 mov $len,$_end
949 #mov $key,$_key # remains resident in $inp register
950 mov $ivp,$_ivp
951 mov $ctx,$_ctx
952 mov $in0,$_in0
953 mov %r11,$_rsp
955 $code.=<<___ if ($win64);
956 movaps %xmm6,`$framesz+16*0`(%rsp)
957 movaps %xmm7,`$framesz+16*1`(%rsp)
958 movaps %xmm8,`$framesz+16*2`(%rsp)
959 movaps %xmm9,`$framesz+16*3`(%rsp)
960 movaps %xmm10,`$framesz+16*4`(%rsp)
961 movaps %xmm11,`$framesz+16*5`(%rsp)
962 movaps %xmm12,`$framesz+16*6`(%rsp)
963 movaps %xmm13,`$framesz+16*7`(%rsp)
964 movaps %xmm14,`$framesz+16*8`(%rsp)
965 movaps %xmm15,`$framesz+16*9`(%rsp)
967 $code.=<<___;
968 .Lprologue_avx2:
969 vzeroall
971 mov $inp,%r13 # borrow $a0
972 vpinsrq \$1,$out,$offload,$offload
973 lea 0x80($key),$inp # size optimization, reassign
974 lea $TABLE+`$SZ*2*$rounds+32`(%rip),%r12 # borrow $a4
975 mov 0xf0-0x80($inp),%r14d # rounds, borrow $a1
976 mov $ctx,%r15 # borrow $a2
977 mov $in0,%rsi # borrow $a3
978 vmovdqu ($ivp),$iv # load IV
979 lea -9(%r14),%r14
981 vmovdqa 0x00(%r12,%r14,8),$mask14
982 vmovdqa 0x10(%r12,%r14,8),$mask12
983 vmovdqa 0x20(%r12,%r14,8),$mask10
985 sub \$-16*$SZ,%r13 # inp++, size optimization
986 mov $SZ*0(%r15),$A
987 lea (%rsi,%r13),%r12 # borrow $a0
988 mov $SZ*1(%r15),$B
989 cmp $len,%r13 # $_end
990 mov $SZ*2(%r15),$C
991 cmove %rsp,%r12 # next block or random data
992 mov $SZ*3(%r15),$D
993 mov $SZ*4(%r15),$E
994 mov $SZ*5(%r15),$F
995 mov $SZ*6(%r15),$G
996 mov $SZ*7(%r15),$H
997 vmovdqu 0x00-0x80($inp),$roundkey
999 if ($SZ==4) { # SHA256
1000 my @X = map("%ymm$_",(0..3));
1001 my ($t0,$t1,$t2,$t3) = map("%ymm$_",(4..7));
1003 $code.=<<___;
1004 jmp .Loop_avx2
1005 .align 16
1006 .Loop_avx2:
1007 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
1008 vmovdqu -16*$SZ+0(%rsi,%r13),%xmm0
1009 vmovdqu -16*$SZ+16(%rsi,%r13),%xmm1
1010 vmovdqu -16*$SZ+32(%rsi,%r13),%xmm2
1011 vmovdqu -16*$SZ+48(%rsi,%r13),%xmm3
1013 vinserti128 \$1,(%r12),@X[0],@X[0]
1014 vinserti128 \$1,16(%r12),@X[1],@X[1]
1015 vpshufb $t3,@X[0],@X[0]
1016 vinserti128 \$1,32(%r12),@X[2],@X[2]
1017 vpshufb $t3,@X[1],@X[1]
1018 vinserti128 \$1,48(%r12),@X[3],@X[3]
1020 lea $TABLE(%rip),$Tbl
1021 vpshufb $t3,@X[2],@X[2]
1022 lea -16*$SZ(%r13),%r13
1023 vpaddd 0x00($Tbl),@X[0],$t0
1024 vpshufb $t3,@X[3],@X[3]
1025 vpaddd 0x20($Tbl),@X[1],$t1
1026 vpaddd 0x40($Tbl),@X[2],$t2
1027 vpaddd 0x60($Tbl),@X[3],$t3
1028 vmovdqa $t0,0x00(%rsp)
1029 xor $a1,$a1
1030 vmovdqa $t1,0x20(%rsp)
1031 lea -$PUSH8(%rsp),%rsp
1032 mov $B,$a3
1033 vmovdqa $t2,0x00(%rsp)
1034 xor $C,$a3 # magic
1035 vmovdqa $t3,0x20(%rsp)
1036 mov $F,$a4
1037 sub \$-16*2*$SZ,$Tbl # size optimization
1038 jmp .Lavx2_00_47
1040 .align 16
1041 .Lavx2_00_47:
1042 vmovdqu (%r13),$inout
1043 vpinsrq \$0,%r13,$offload,$offload
1046 sub AVX2_256_00_47 () {
1047 my $j = shift;
1048 my $body = shift;
1049 my @X = @_;
1050 my @insns = (&$body,&$body,&$body,&$body); # 96 instructions
1051 my $base = "+2*$PUSH8(%rsp)";
1053 &lea ("%rsp","-$PUSH8(%rsp)") if (($j%2)==0);
1054 foreach (Xupdate_256_AVX()) { # 29 instructions
1055 eval;
1056 eval(shift(@insns));
1057 eval(shift(@insns));
1058 eval(shift(@insns));
1060 &vpaddd ($t2,@X[0],16*2*$j."($Tbl)");
1061 foreach (@insns) { eval; } # remaining instructions
1062 &vmovdqa ((32*$j)%$PUSH8."(%rsp)",$t2);
1064 $aesni_cbc_idx=0;
1065 for ($i=0,$j=0; $j<4; $j++) {
1066 &AVX2_256_00_47($j,\&bodyx_00_15,@X);
1067 push(@X,shift(@X)); # rotate(@X)
1069 &vmovq ("%r13",$offload); # borrow $a0
1070 &vpextrq ("%r15",$offload,1); # borrow $a2
1071 &vpand ($temp,$temp,$mask14);
1072 &vpor ($iv,$iv,$temp);
1073 &vmovdqu ("(%r15,%r13)",$iv); # write output
1074 &lea ("%r13","16(%r13)"); # inp++
1076 &lea ($Tbl,16*2*$SZ."($Tbl)");
1077 &cmpb (($SZ-1)."($Tbl)",0);
1078 &jne (".Lavx2_00_47");
1080 &vmovdqu ($inout,"(%r13)");
1081 &vpinsrq ($offload,$offload,"%r13",0);
1083 $aesni_cbc_idx=0;
1084 for ($i=0; $i<16; ) {
1085 my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
1086 foreach(bodyx_00_15()) { eval; }
1089 $code.=<<___;
1090 vpextrq \$1,$offload,%r12 # $_out, borrow $a4
1091 vmovq $offload,%r13 # $_inp, borrow $a0
1092 mov `2*$SZ*$rounds+5*8`(%rsp),%r15 # $_ctx, borrow $a2
1093 add $a1,$A
1094 lea `2*$SZ*($rounds-8)`(%rsp),$Tbl
1096 vpand $mask14,$temp,$temp
1097 vpor $temp,$iv,$iv
1098 vmovdqu $iv,(%r12,%r13) # write output
1099 lea 16(%r13),%r13
1101 add $SZ*0(%r15),$A
1102 add $SZ*1(%r15),$B
1103 add $SZ*2(%r15),$C
1104 add $SZ*3(%r15),$D
1105 add $SZ*4(%r15),$E
1106 add $SZ*5(%r15),$F
1107 add $SZ*6(%r15),$G
1108 add $SZ*7(%r15),$H
1110 mov $A,$SZ*0(%r15)
1111 mov $B,$SZ*1(%r15)
1112 mov $C,$SZ*2(%r15)
1113 mov $D,$SZ*3(%r15)
1114 mov $E,$SZ*4(%r15)
1115 mov $F,$SZ*5(%r15)
1116 mov $G,$SZ*6(%r15)
1117 mov $H,$SZ*7(%r15)
1119 cmp `$PUSH8+2*8`($Tbl),%r13 # $_end
1120 je .Ldone_avx2
1122 xor $a1,$a1
1123 mov $B,$a3
1124 mov $F,$a4
1125 xor $C,$a3 # magic
1126 jmp .Lower_avx2
1127 .align 16
1128 .Lower_avx2:
1129 vmovdqu (%r13),$inout
1130 vpinsrq \$0,%r13,$offload,$offload
1132 $aesni_cbc_idx=0;
1133 for ($i=0; $i<16; ) {
1134 my $base="+16($Tbl)";
1135 foreach(bodyx_00_15()) { eval; }
1136 &lea ($Tbl,"-$PUSH8($Tbl)") if ($i==8);
1138 $code.=<<___;
1139 vmovq $offload,%r13 # borrow $a0
1140 vpextrq \$1,$offload,%r15 # borrow $a2
1141 vpand $mask14,$temp,$temp
1142 vpor $temp,$iv,$iv
1143 lea -$PUSH8($Tbl),$Tbl
1144 vmovdqu $iv,(%r15,%r13) # write output
1145 lea 16(%r13),%r13 # inp++
1146 cmp %rsp,$Tbl
1147 jae .Lower_avx2
1149 mov `2*$SZ*$rounds+5*8`(%rsp),%r15 # $_ctx, borrow $a2
1150 lea 16*$SZ(%r13),%r13
1151 mov `2*$SZ*$rounds+6*8`(%rsp),%rsi # $_in0, borrow $a3
1152 add $a1,$A
1153 lea `2*$SZ*($rounds-8)`(%rsp),%rsp
1155 add $SZ*0(%r15),$A
1156 add $SZ*1(%r15),$B
1157 add $SZ*2(%r15),$C
1158 add $SZ*3(%r15),$D
1159 add $SZ*4(%r15),$E
1160 add $SZ*5(%r15),$F
1161 add $SZ*6(%r15),$G
1162 lea (%rsi,%r13),%r12
1163 add $SZ*7(%r15),$H
1165 cmp $_end,%r13
1167 mov $A,$SZ*0(%r15)
1168 cmove %rsp,%r12 # next block or stale data
1169 mov $B,$SZ*1(%r15)
1170 mov $C,$SZ*2(%r15)
1171 mov $D,$SZ*3(%r15)
1172 mov $E,$SZ*4(%r15)
1173 mov $F,$SZ*5(%r15)
1174 mov $G,$SZ*6(%r15)
1175 mov $H,$SZ*7(%r15)
1177 jbe .Loop_avx2
1178 lea (%rsp),$Tbl
1180 .Ldone_avx2:
1181 lea ($Tbl),%rsp
1182 mov $_ivp,$ivp
1183 mov $_rsp,%rsi
1184 vmovdqu $iv,($ivp) # output IV
1185 vzeroall
1187 $code.=<<___ if ($win64);
1188 movaps `$framesz+16*0`(%rsp),%xmm6
1189 movaps `$framesz+16*1`(%rsp),%xmm7
1190 movaps `$framesz+16*2`(%rsp),%xmm8
1191 movaps `$framesz+16*3`(%rsp),%xmm9
1192 movaps `$framesz+16*4`(%rsp),%xmm10
1193 movaps `$framesz+16*5`(%rsp),%xmm11
1194 movaps `$framesz+16*6`(%rsp),%xmm12
1195 movaps `$framesz+16*7`(%rsp),%xmm13
1196 movaps `$framesz+16*8`(%rsp),%xmm14
1197 movaps `$framesz+16*9`(%rsp),%xmm15
1199 $code.=<<___;
1200 mov (%rsi),%r15
1201 mov 8(%rsi),%r14
1202 mov 16(%rsi),%r13
1203 mov 24(%rsi),%r12
1204 mov 32(%rsi),%rbp
1205 mov 40(%rsi),%rbx
1206 lea 48(%rsi),%rsp
1207 .Lepilogue_avx2:
1209 .size ${func}_avx2,.-${func}_avx2
1214 my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
1216 my ($rounds,$Tbl)=("%r11d","%rbx");
1218 my ($iv,$in,$rndkey0)=map("%xmm$_",(6,14,15));
1219 my @rndkey=("%xmm4","%xmm5");
1220 my $r=0;
1221 my $sn=0;
1223 my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..3,7..9));
1224 my @MSG=map("%xmm$_",(10..13));
1226 my $aesenc=sub {
1227 use integer;
1228 my ($n,$k)=($r/10,$r%10);
1229 if ($k==0) {
1230 $code.=<<___;
1231 movups `16*$n`($in0),$in # load input
1232 xorps $rndkey0,$in
1234 $code.=<<___ if ($n);
1235 movups $iv,`16*($n-1)`($out,$in0) # write output
1237 $code.=<<___;
1238 xorps $in,$iv
1239 movups `32+16*$k-112`($key),$rndkey[1]
1240 aesenc $rndkey[0],$iv
1242 } elsif ($k==9) {
1243 $sn++;
1244 $code.=<<___;
1245 cmp \$11,$rounds
1246 jb .Laesenclast$sn
1247 movups `32+16*($k+0)-112`($key),$rndkey[1]
1248 aesenc $rndkey[0],$iv
1249 movups `32+16*($k+1)-112`($key),$rndkey[0]
1250 aesenc $rndkey[1],$iv
1251 je .Laesenclast$sn
1252 movups `32+16*($k+2)-112`($key),$rndkey[1]
1253 aesenc $rndkey[0],$iv
1254 movups `32+16*($k+3)-112`($key),$rndkey[0]
1255 aesenc $rndkey[1],$iv
1256 .Laesenclast$sn:
1257 aesenclast $rndkey[0],$iv
1258 movups 16-112($key),$rndkey[1] # forward reference
1261 } else {
1262 $code.=<<___;
1263 movups `32+16*$k-112`($key),$rndkey[1]
1264 aesenc $rndkey[0],$iv
1267 $r++; unshift(@rndkey,pop(@rndkey));
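# $aesenc() emits one AES round per invocation and is interleaved with
# the sha256rnds2 instructions below.  The ($r/10,$r%10) bookkeeping
# means the first call of every group of ten also loads and whitens the
# next plaintext block and stores the previous ciphertext block, while
# the tenth call (k==9) finishes with aesenclast, branching through the
# compare-with-11 ladder to run 12 or 14 rounds for 192/256-bit keys.
# Round keys are fetched relative to $key+112, matching the
# "lea 112($key),$key" size optimization in the prologue.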
1270 if ($shaext) {
1271 my $Tbl="%rax";
1273 $code.=<<___;
1274 .type ${func}_shaext,\@function,6
1275 .align 32
1276 ${func}_shaext:
1277 mov `($win64?56:8)`(%rsp),$inp # load 7th argument
1279 $code.=<<___ if ($win64);
1280 lea `-8-10*16`(%rsp),%rsp
1281 movaps %xmm6,-8-10*16(%rax)
1282 movaps %xmm7,-8-9*16(%rax)
1283 movaps %xmm8,-8-8*16(%rax)
1284 movaps %xmm9,-8-7*16(%rax)
1285 movaps %xmm10,-8-6*16(%rax)
1286 movaps %xmm11,-8-5*16(%rax)
1287 movaps %xmm12,-8-4*16(%rax)
1288 movaps %xmm13,-8-3*16(%rax)
1289 movaps %xmm14,-8-2*16(%rax)
1290 movaps %xmm15,-8-1*16(%rax)
1291 .Lprologue_shaext:
1293 $code.=<<___;
1294 lea K256+0x80(%rip),$Tbl
1295 movdqu ($ctx),$ABEF # DCBA
1296 movdqu 16($ctx),$CDGH # HGFE
1297 movdqa 0x200-0x80($Tbl),$TMP # byte swap mask
1299 mov 240($key),$rounds
1300 sub $in0,$out
1301 movups ($key),$rndkey0 # $key[0]
1302 movups 16($key),$rndkey[0] # forward reference
1303 lea 112($key),$key # size optimization
1305 pshufd \$0x1b,$ABEF,$Wi # ABCD
1306 pshufd \$0xb1,$ABEF,$ABEF # CDAB
1307 pshufd \$0x1b,$CDGH,$CDGH # EFGH
1308 movdqa $TMP,$BSWAP # offload
1309 palignr \$8,$CDGH,$ABEF # ABEF
1310 punpcklqdq $Wi,$CDGH # CDGH
1312 jmp .Loop_shaext
1314 .align 16
1315 .Loop_shaext:
1316 movdqu ($inp),@MSG[0]
1317 movdqu 0x10($inp),@MSG[1]
1318 movdqu 0x20($inp),@MSG[2]
1319 pshufb $TMP,@MSG[0]
1320 movdqu 0x30($inp),@MSG[3]
1322 movdqa 0*32-0x80($Tbl),$Wi
1323 paddd @MSG[0],$Wi
1324 pshufb $TMP,@MSG[1]
1325 movdqa $CDGH,$CDGH_SAVE # offload
1326 movdqa $ABEF,$ABEF_SAVE # offload
1328 &$aesenc();
1329 $code.=<<___;
1330 sha256rnds2 $ABEF,$CDGH # 0-3
1331 pshufd \$0x0e,$Wi,$Wi
1333 &$aesenc();
1334 $code.=<<___;
1335 sha256rnds2 $CDGH,$ABEF
1337 movdqa 1*32-0x80($Tbl),$Wi
1338 paddd @MSG[1],$Wi
1339 pshufb $TMP,@MSG[2]
1340 lea 0x40($inp),$inp
1342 &$aesenc();
1343 $code.=<<___;
1344 sha256rnds2 $ABEF,$CDGH # 4-7
1345 pshufd \$0x0e,$Wi,$Wi
1347 &$aesenc();
1348 $code.=<<___;
1349 sha256rnds2 $CDGH,$ABEF
1351 movdqa 2*32-0x80($Tbl),$Wi
1352 paddd @MSG[2],$Wi
1353 pshufb $TMP,@MSG[3]
1354 sha256msg1 @MSG[1],@MSG[0]
1356 &$aesenc();
1357 $code.=<<___;
1358 sha256rnds2 $ABEF,$CDGH # 8-11
1359 pshufd \$0x0e,$Wi,$Wi
1360 movdqa @MSG[3],$TMP
1361 palignr \$4,@MSG[2],$TMP
1362 paddd $TMP,@MSG[0]
1364 &$aesenc();
1365 $code.=<<___;
1366 sha256rnds2 $CDGH,$ABEF
1368 movdqa 3*32-0x80($Tbl),$Wi
1369 paddd @MSG[3],$Wi
1370 sha256msg2 @MSG[3],@MSG[0]
1371 sha256msg1 @MSG[2],@MSG[1]
1373 &$aesenc();
1374 $code.=<<___;
1375 sha256rnds2 $ABEF,$CDGH # 12-15
1376 pshufd \$0x0e,$Wi,$Wi
1378 &$aesenc();
1379 $code.=<<___;
1380 movdqa @MSG[0],$TMP
1381 palignr \$4,@MSG[3],$TMP
1382 paddd $TMP,@MSG[1]
1383 sha256rnds2 $CDGH,$ABEF
1385 for($i=4;$i<16-3;$i++) {
1386 &$aesenc() if (($r%10)==0);
1387 $code.=<<___;
1388 movdqa $i*32-0x80($Tbl),$Wi
1389 paddd @MSG[0],$Wi
1390 sha256msg2 @MSG[0],@MSG[1]
1391 sha256msg1 @MSG[3],@MSG[2]
1393 &$aesenc();
1394 $code.=<<___;
1395 sha256rnds2 $ABEF,$CDGH # 16-19...
1396 pshufd \$0x0e,$Wi,$Wi
1397 movdqa @MSG[1],$TMP
1398 palignr \$4,@MSG[0],$TMP
1399 paddd $TMP,@MSG[2]
1401 &$aesenc();
1402 &$aesenc() if ($r==19);
1403 $code.=<<___;
1404 sha256rnds2 $CDGH,$ABEF
1406 push(@MSG,shift(@MSG));
1408 $code.=<<___;
1409 movdqa 13*32-0x80($Tbl),$Wi
1410 paddd @MSG[0],$Wi
1411 sha256msg2 @MSG[0],@MSG[1]
1412 sha256msg1 @MSG[3],@MSG[2]
1414 &$aesenc();
1415 $code.=<<___;
1416 sha256rnds2 $ABEF,$CDGH # 52-55
1417 pshufd \$0x0e,$Wi,$Wi
1418 movdqa @MSG[1],$TMP
1419 palignr \$4,@MSG[0],$TMP
1420 paddd $TMP,@MSG[2]
1422 &$aesenc();
1423 &$aesenc();
1424 $code.=<<___;
1425 sha256rnds2 $CDGH,$ABEF
1427 movdqa 14*32-0x80($Tbl),$Wi
1428 paddd @MSG[1],$Wi
1429 sha256msg2 @MSG[1],@MSG[2]
1430 movdqa $BSWAP,$TMP
1432 &$aesenc();
1433 $code.=<<___;
1434 sha256rnds2 $ABEF,$CDGH # 56-59
1435 pshufd \$0x0e,$Wi,$Wi
1437 &$aesenc();
1438 $code.=<<___;
1439 sha256rnds2 $CDGH,$ABEF
1441 movdqa 15*32-0x80($Tbl),$Wi
1442 paddd @MSG[2],$Wi
1444 &$aesenc();
1445 &$aesenc();
1446 $code.=<<___;
1447 sha256rnds2 $ABEF,$CDGH # 60-63
1448 pshufd \$0x0e,$Wi,$Wi
1450 &$aesenc();
1451 $code.=<<___;
1452 sha256rnds2 $CDGH,$ABEF
1453 #pxor $CDGH,$rndkey0 # black magic
1455 while ($r<40) { &$aesenc(); } # remaining aesenc's
1456 $code.=<<___;
1457 #xorps $CDGH,$rndkey0 # black magic
1458 paddd $CDGH_SAVE,$CDGH
1459 paddd $ABEF_SAVE,$ABEF
1461 dec $len
1462 movups $iv,48($out,$in0) # write output
1463 lea 64($in0),$in0
1464 jnz .Loop_shaext
1466 pshufd \$0xb1,$CDGH,$CDGH # DCHG
1467 pshufd \$0x1b,$ABEF,$TMP # FEBA
1468 pshufd \$0xb1,$ABEF,$ABEF # BAFE
1469 punpckhqdq $CDGH,$ABEF # DCBA
1470 palignr \$8,$TMP,$CDGH # HGFE
1472 movups $iv,($ivp) # write IV
1473 movdqu $ABEF,($ctx)
1474 movdqu $CDGH,16($ctx)
1476 $code.=<<___ if ($win64);
1477 movaps 0*16(%rsp),%xmm6
1478 movaps 1*16(%rsp),%xmm7
1479 movaps 2*16(%rsp),%xmm8
1480 movaps 3*16(%rsp),%xmm9
1481 movaps 4*16(%rsp),%xmm10
1482 movaps 5*16(%rsp),%xmm11
1483 movaps 6*16(%rsp),%xmm12
1484 movaps 7*16(%rsp),%xmm13
1485 movaps 8*16(%rsp),%xmm14
1486 movaps 9*16(%rsp),%xmm15
1487 lea 8+10*16(%rsp),%rsp
1488 .Lepilogue_shaext:
1490 $code.=<<___;
1492 .size ${func}_shaext,.-${func}_shaext
1495 }}}}}
1497 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
1498 # CONTEXT *context,DISPATCHER_CONTEXT *disp)
1499 if ($win64 && $avx) {
1500 $rec="%rcx";
1501 $frame="%rdx";
1502 $context="%r8";
1503 $disp="%r9";
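# se_handler() provides the usual perlasm Win64 unwind support: when an
# exception hits between the .Lprologue_* and .Lepilogue_* labels
# recorded in HandlerData, it recovers the original %rsp (from the
# saved $_rsp slot, or by re-applying the alignment arithmetic for the
# AVX2 and SHAEXT frames), copies the saved callee-saved GPRs and
# xmm6-xmm15 back into the CONTEXT record, and then hands off to
# RtlVirtualUnwind.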
1505 $code.=<<___;
1506 .extern __imp_RtlVirtualUnwind
1507 .type se_handler,\@abi-omnipotent
1508 .align 16
1509 se_handler:
1510 push %rsi
1511 push %rdi
1512 push %rbx
1513 push %rbp
1514 push %r12
1515 push %r13
1516 push %r14
1517 push %r15
1518 pushfq
1519 sub \$64,%rsp
1521 mov 120($context),%rax # pull context->Rax
1522 mov 248($context),%rbx # pull context->Rip
1524 mov 8($disp),%rsi # disp->ImageBase
1525 mov 56($disp),%r11 # disp->HanderlData
1527 mov 0(%r11),%r10d # HandlerData[0]
1528 lea (%rsi,%r10),%r10 # prologue label
1529 cmp %r10,%rbx # context->Rip<prologue label
1530 jb .Lin_prologue
1532 mov 152($context),%rax # pull context->Rsp
1534 mov 4(%r11),%r10d # HandlerData[1]
1535 lea (%rsi,%r10),%r10 # epilogue label
1536 cmp %r10,%rbx # context->Rip>=epilogue label
1537 jae .Lin_prologue
1539 $code.=<<___ if ($shaext);
1540 lea aesni_cbc_sha256_enc_shaext(%rip),%r10
1541 cmp %r10,%rbx
1542 jb .Lnot_in_shaext
1544 lea (%rax),%rsi
1545 lea 512($context),%rdi # &context.Xmm6
1546 mov \$20,%ecx
1547 .long 0xa548f3fc # cld; rep movsq
1548 lea 168(%rax),%rax # adjust stack pointer
1549 jmp .Lin_prologue
1550 .Lnot_in_shaext:
1552 $code.=<<___ if ($avx>1);
1553 lea .Lavx2_shortcut(%rip),%r10
1554 cmp %r10,%rbx # context->Rip<avx2_shortcut
1555 jb .Lnot_in_avx2
1557 and \$-256*$SZ,%rax
1558 add \$`2*$SZ*($rounds-8)`,%rax
1559 .Lnot_in_avx2:
1561 $code.=<<___;
1562 mov %rax,%rsi # put aside Rsp
1563 mov 16*$SZ+7*8(%rax),%rax # pull $_rsp
1564 lea 48(%rax),%rax
1566 mov -8(%rax),%rbx
1567 mov -16(%rax),%rbp
1568 mov -24(%rax),%r12
1569 mov -32(%rax),%r13
1570 mov -40(%rax),%r14
1571 mov -48(%rax),%r15
1572 mov %rbx,144($context) # restore context->Rbx
1573 mov %rbp,160($context) # restore context->Rbp
1574 mov %r12,216($context) # restore context->R12
1575 mov %r13,224($context) # restore context->R13
1576 mov %r14,232($context) # restore context->R14
1577 mov %r15,240($context) # restore context->R15
1579 lea 16*$SZ+8*8(%rsi),%rsi # Xmm6- save area
1580 lea 512($context),%rdi # &context.Xmm6
1581 mov \$20,%ecx
1582 .long 0xa548f3fc # cld; rep movsq
1584 .Lin_prologue:
1585 mov 8(%rax),%rdi
1586 mov 16(%rax),%rsi
1587 mov %rax,152($context) # restore context->Rsp
1588 mov %rsi,168($context) # restore context->Rsi
1589 mov %rdi,176($context) # restore context->Rdi
1591 mov 40($disp),%rdi # disp->ContextRecord
1592 mov $context,%rsi # context
1593 mov \$154,%ecx # sizeof(CONTEXT)
1594 .long 0xa548f3fc # cld; rep movsq
1596 mov $disp,%rsi
1597 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
1598 mov 8(%rsi),%rdx # arg2, disp->ImageBase
1599 mov 0(%rsi),%r8 # arg3, disp->ControlPc
1600 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
1601 mov 40(%rsi),%r10 # disp->ContextRecord
1602 lea 56(%rsi),%r11 # &disp->HandlerData
1603 lea 24(%rsi),%r12 # &disp->EstablisherFrame
1604 mov %r10,32(%rsp) # arg5
1605 mov %r11,40(%rsp) # arg6
1606 mov %r12,48(%rsp) # arg7
1607 mov %rcx,56(%rsp) # arg8, (NULL)
1608 call *__imp_RtlVirtualUnwind(%rip)
1610 mov \$1,%eax # ExceptionContinueSearch
1611 add \$64,%rsp
1612 popfq
1613 pop %r15
1614 pop %r14
1615 pop %r13
1616 pop %r12
1617 pop %rbp
1618 pop %rbx
1619 pop %rdi
1620 pop %rsi
1622 .size se_handler,.-se_handler
1624 .section .pdata
1625 .rva .LSEH_begin_${func}_xop
1626 .rva .LSEH_end_${func}_xop
1627 .rva .LSEH_info_${func}_xop
1629 .rva .LSEH_begin_${func}_avx
1630 .rva .LSEH_end_${func}_avx
1631 .rva .LSEH_info_${func}_avx
1633 $code.=<<___ if ($avx>1);
1634 .rva .LSEH_begin_${func}_avx2
1635 .rva .LSEH_end_${func}_avx2
1636 .rva .LSEH_info_${func}_avx2
1638 $code.=<<___ if ($shaext);
1639 .rva .LSEH_begin_${func}_shaext
1640 .rva .LSEH_end_${func}_shaext
1641 .rva .LSEH_info_${func}_shaext
1643 $code.=<<___;
1644 .section .xdata
1645 .align 8
1646 .LSEH_info_${func}_xop:
1647 .byte 9,0,0,0
1648 .rva se_handler
1649 .rva .Lprologue_xop,.Lepilogue_xop # HandlerData[]
1651 .LSEH_info_${func}_avx:
1652 .byte 9,0,0,0
1653 .rva se_handler
1654 .rva .Lprologue_avx,.Lepilogue_avx # HandlerData[]
1656 $code.=<<___ if ($avx>1);
1657 .LSEH_info_${func}_avx2:
1658 .byte 9,0,0,0
1659 .rva se_handler
1660 .rva .Lprologue_avx2,.Lepilogue_avx2 # HandlerData[]
1662 $code.=<<___ if ($shaext);
1663 .LSEH_info_${func}_shaext:
1664 .byte 9,0,0,0
1665 .rva se_handler
1666 .rva .Lprologue_shaext,.Lepilogue_shaext # HandlerData[]
1670 ####################################################################
1671 sub rex {
1672 local *opcode=shift;
1673 my ($dst,$src)=@_;
1674 my $rex=0;
1676 $rex|=0x04 if($dst>=8);
1677 $rex|=0x01 if($src>=8);
1678 unshift @opcode,$rex|0x40 if($rex);
1682 my %opcodelet = (
1683 "sha256rnds2" => 0xcb,
1684 "sha256msg1" => 0xcc,
1685 "sha256msg2" => 0xcd );
1687 sub sha256op38 {
1688 my $instr = shift;
1690 if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
1691 my @opcode=(0x0f,0x38);
1692 rex(\@opcode,$2,$1);
1693 push @opcode,$opcodelet{$instr};
1694 push @opcode,0xc0|($1&7)|(($2&7)<<3); # ModR/M
1695 return ".byte\t".join(',',@opcode);
1696 } else {
1697 return $instr."\t".@_[0];
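# sha256op38() hand-assembles the SHA extension instructions for
# assemblers that do not know the mnemonics yet: the opcodes are
# NP 0F 38 CB/CC/CD (sha256rnds2/sha256msg1/sha256msg2), emitted as
# .byte sequences with a REX prefix when either operand is
# xmm8..xmm15; the implicit %xmm0 operand of sha256rnds2 is not
# encoded.  Anything else is passed through unchanged.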
1702 $code =~ s/\`([^\`]*)\`/eval $1/gem;
1703 $code =~ s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/gem;
1704 print $code;
1705 close STDOUT;