# OpenSSL 1.0.2f
# [tomato.git] release/src/router/openssl/crypto/sha/asm/sha256-mb-x86_64.pl
# blob 9770286b9596adb485e743f421796b74df5e6219
#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# Multi-buffer SHA256 procedure processes n buffers in parallel by
# placing buffer data to designated lane of SIMD register. n is
# naturally limited to 4 on pre-AVX2 processors and to 8 on
# AVX2-capable processors such as Haswell.
#
#		this	+aesni(i)	sha256	aesni-sha256	gain(iv)
# -------------------------------------------------------------------
# Westmere(ii)	23.3/n	+1.28=7.11(n=4)	12.3	+3.75=16.1	+126%
# Atom(ii)	38.7/n	+3.93=13.6(n=4)	20.8	+5.69=26.5	+95%
# Sandy Bridge	(20.5	+5.15=25.7)/n	11.6	13.0		+103%
# Ivy Bridge	(20.4	+5.14=25.5)/n	10.3	11.6		+82%
# Haswell(iii)	(21.0	+5.00=26.0)/n	7.80	8.79		+170%
# Bulldozer	(21.6	+5.76=27.4)/n	13.6	13.7		+100%
#
# (i)	multi-block CBC encrypt with 128-bit key;
# (ii)	(HASH+AES)/n does not apply to Westmere for n>3 and Atom,
#	because of lower AES-NI instruction throughput, nor is there
#	AES-NI-SHA256 stitch for these processors;
# (iii)	"this" is for n=8, when we gather twice as much data, result
#	for n=4 is 20.3+4.44=24.7;
# (iv)	presented improvement coefficients are asymptotic limits and
#	in real-life application are somewhat lower, e.g. for 2KB
#	fragments they range from 75% to 130% (on Haswell);

# perlasm command-line convention: first argument is the assembler
# "flavour" (elf, macosx, mingw64, nasm, masm, ...), second is the
# output file; a lone argument containing a dot is taken as the output.
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Locate the x86_64 perlasm translator relative to this script.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Probe assembler AVX capability: $avx ends up 0 (none), 1 (AVX) or
# 2 (AVX2), derived from GNU as, nasm, MASM or clang version strings.
$avx=0;

if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.19) + ($1>=2.22);
}

if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.09) + ($1>=2.10);
}

if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	   `ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$avx = ($1>=10) + ($1>=11);
}

if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
	$avx = ($2>=3.0) + ($2>3.0);
}

# Pipe everything we print through the translator into the output file.
open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;
# void sha256_multi_block (
#     struct {	unsigned int A[8];
#		unsigned int B[8];
#		unsigned int C[8];
#		unsigned int D[8];
#		unsigned int E[8];
#		unsigned int F[8];
#		unsigned int G[8];
#		unsigned int H[8];	} *ctx,
#     struct {	void *ptr; int blocks;	} inp[8],
#     int num);		/* 1 or 2 */

$ctx="%rdi";	# 1st arg
$inp="%rsi";	# 2nd arg
$num="%edx";	# 3rd arg
@ptr=map("%r$_",(8..11));	# per-lane input pointers
$Tbl="%rbp";			# round-constant table pointer

# One SIMD register per SHA256 state word, one 32-bit lane per buffer.
@V=($A,$B,$C,$D,$E,$F,$G,$H)=map("%xmm$_",(8..15));
($t1,$t2,$t3,$axb,$bxc,$Xi,$Xn,$sigma)=map("%xmm$_",(0..7));

$REG_SZ=16;	# SIMD register width in bytes (becomes 32 for AVX2)

# Returns the address of slot $off%16 in the on-stack message-schedule
# ring buffer.  Slots below 256 bytes are addressed off %rax, the rest
# off %rbx (both set up biased by 128 by the generated code).
sub Xi_off {
my $off = shift;

    $off %= 16; $off *= $REG_SZ;
    $off<256 ? "$off-128(%rax)" : "$off-256-128(%rbx)";
}
# Emit one SSSE3 SHA256 round for 4 interleaved buffers; rounds 0..15
# additionally gather 4 bytes from each input pointer and byte-swap
# them via pshufb ($Xn holds the bswap mask during rounds 0..15).
# Arguments: round index $i and the rotated permutation of @V.
sub ROUND_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;

$code.=<<___ if ($i<15);
	movd	`4*$i`(@ptr[0]),$Xi
	movd	`4*$i`(@ptr[1]),$t1
	movd	`4*$i`(@ptr[2]),$t2
	movd	`4*$i`(@ptr[3]),$t3
	punpckldq $t2,$Xi
	punpckldq $t3,$t1
	punpckldq $t1,$Xi
___
$code.=<<___ if ($i==15);
	movd	`4*$i`(@ptr[0]),$Xi
	lea	`16*4`(@ptr[0]),@ptr[0]
	movd	`4*$i`(@ptr[1]),$t1
	lea	`16*4`(@ptr[1]),@ptr[1]
	movd	`4*$i`(@ptr[2]),$t2
	lea	`16*4`(@ptr[2]),@ptr[2]
	movd	`4*$i`(@ptr[3]),$t3
	lea	`16*4`(@ptr[3]),@ptr[3]
	punpckldq $t2,$Xi
	punpckldq $t3,$t1
	punpckldq $t1,$Xi
___
$code.=<<___;
	movdqa	$e,$sigma
	`"pshufb	$Xn,$Xi" if ($i<=15 && ($i&1)==0)`
	movdqa	$e,$t3
	`"pshufb	$Xn,$Xi" if ($i<=15 && ($i&1)==1)`
	psrld	\$6,$sigma
	movdqa	$e,$t2
	pslld	\$7,$t3
	movdqa	$Xi,`&Xi_off($i)`
	paddd	$h,$Xi				# Xi+=h

	psrld	\$11,$t2
	pxor	$t3,$sigma
	pslld	\$21-7,$t3
	paddd	`32*($i%8)-128`($Tbl),$Xi	# Xi+=K[round]
	pxor	$t2,$sigma

	psrld	\$25-11,$t2
	movdqa	$e,$t1
	`"prefetcht0	63(@ptr[0])"		if ($i==15)`
	pxor	$t3,$sigma
	movdqa	$e,$axb				# borrow $axb
	pslld	\$26-21,$t3
	pandn	$g,$t1
	pand	$f,$axb
	pxor	$t2,$sigma

	`"prefetcht0	63(@ptr[1])"		if ($i==15)`
	movdqa	$a,$t2
	pxor	$t3,$sigma			# Sigma1(e)
	movdqa	$a,$t3
	psrld	\$2,$t2
	paddd	$sigma,$Xi			# Xi+=Sigma1(e)
	pxor	$axb,$t1			# Ch(e,f,g)
	movdqa	$b,$axb
	movdqa	$a,$sigma
	pslld	\$10,$t3
	pxor	$a,$axb				# a^b, b^c in next round

	`"prefetcht0	63(@ptr[2])"		if ($i==15)`
	psrld	\$13,$sigma
	pxor	$t3,$t2
	paddd	$t1,$Xi				# Xi+=Ch(e,f,g)
	pslld	\$19-10,$t3
	pand	$axb,$bxc
	pxor	$sigma,$t2

	`"prefetcht0	63(@ptr[3])"		if ($i==15)`
	psrld	\$22-13,$sigma
	pxor	$t3,$t2
	movdqa	$b,$h
	pslld	\$30-19,$t3
	pxor	$t2,$sigma
	pxor	$bxc,$h				# h=Maj(a,b,c)=Ch(a^b,c,b)
	paddd	$Xi,$d				# d+=Xi
	pxor	$t3,$sigma			# Sigma0(a)

	paddd	$Xi,$h				# h+=Xi
	paddd	$sigma,$h			# h+=Sigma0(a)
___
$code.=<<___ if (($i%8)==7);
	lea	`32*8`($Tbl),$Tbl
___
	($axb,$bxc)=($bxc,$axb);
}
# Emit message-schedule expansion plus one round:
# X[i] += sigma0(X[i+1]) + sigma1(X[i+14]) + X[i+9], then ROUND_00_15.
sub ROUND_16_XX {
my $i=shift;

$code.=<<___;
	movdqa	`&Xi_off($i+1)`,$Xn
	paddd	`&Xi_off($i+9)`,$Xi		# Xi+=X[i+9]

	movdqa	$Xn,$sigma
	movdqa	$Xn,$t2
	psrld	\$3,$sigma
	movdqa	$Xn,$t3

	psrld	\$7,$t2
	movdqa	`&Xi_off($i+14)`,$t1
	pslld	\$14,$t3
	pxor	$t2,$sigma
	psrld	\$18-7,$t2
	movdqa	$t1,$axb			# borrow $axb
	pxor	$t3,$sigma
	pslld	\$25-14,$t3
	pxor	$t2,$sigma
	psrld	\$10,$t1
	movdqa	$axb,$t2

	psrld	\$17,$axb
	pxor	$t3,$sigma			# sigma0(X[i+1])
	pslld	\$13,$t2
	paddd	$sigma,$Xi			# Xi+=sigma0(e)
	pxor	$axb,$t1
	psrld	\$19-17,$axb
	pxor	$t2,$t1
	pslld	\$15-13,$t2
	pxor	$axb,$t1
	pxor	$t2,$t1				# sigma0(X[i+14])
	paddd	$t1,$Xi				# Xi+=sigma1(X[i+14])
___
	&ROUND_00_15($i,@_);
	($Xi,$Xn)=($Xn,$Xi);
}
# Generate the SSSE3 entry point sha256_multi_block: dispatches to the
# SHA-extension or AVX code paths when the CPU supports them, otherwise
# runs 4 buffers in parallel, one per XMM lane.
$code.=<<___;
.text

.extern	OPENSSL_ia32cap_P

.globl	sha256_multi_block
.type	sha256_multi_block,\@function,3
.align	32
sha256_multi_block:
	mov	OPENSSL_ia32cap_P+4(%rip),%rcx
	bt	\$61,%rcx			# check SHA bit
	jc	_shaext_shortcut
___
$code.=<<___ if ($avx);
	test	\$`1<<28`,%ecx
	jnz	_avx_shortcut
___
$code.=<<___;
	mov	%rsp,%rax
	push	%rbx
	push	%rbp
___
$code.=<<___ if ($win64);
	lea	-0xa8(%rsp),%rsp
	movaps	%xmm6,(%rsp)
	movaps	%xmm7,0x10(%rsp)
	movaps	%xmm8,0x20(%rsp)
	movaps	%xmm9,0x30(%rsp)
	movaps	%xmm10,-0x78(%rax)
	movaps	%xmm11,-0x68(%rax)
	movaps	%xmm12,-0x58(%rax)
	movaps	%xmm13,-0x48(%rax)
	movaps	%xmm14,-0x38(%rax)
	movaps	%xmm15,-0x28(%rax)
___
$code.=<<___;
	sub	\$`$REG_SZ*18`, %rsp
	and	\$-256,%rsp
	mov	%rax,`$REG_SZ*17`(%rsp)		# original %rsp
.Lbody:
	lea	K256+128(%rip),$Tbl
	lea	`$REG_SZ*16`(%rsp),%rbx
	lea	0x80($ctx),$ctx			# size optimization

.Loop_grande:
	mov	$num,`$REG_SZ*17+8`(%rsp)	# original $num
	xor	$num,$num
___
for($i=0;$i<4;$i++) {
$code.=<<___;
	mov	`16*$i+0`($inp),@ptr[$i]	# input pointer
	mov	`16*$i+8`($inp),%ecx		# number of blocks
	cmp	$num,%ecx
	cmovg	%ecx,$num			# find maximum
	test	%ecx,%ecx
	mov	%ecx,`4*$i`(%rbx)		# initialize counters
	cmovle	$Tbl,@ptr[$i]			# cancel input
___
}
$code.=<<___;
	test	$num,$num
	jz	.Ldone

	movdqu	0x00-0x80($ctx),$A		# load context
	lea	128(%rsp),%rax
	movdqu	0x20-0x80($ctx),$B
	movdqu	0x40-0x80($ctx),$C
	movdqu	0x60-0x80($ctx),$D
	movdqu	0x80-0x80($ctx),$E
	movdqu	0xa0-0x80($ctx),$F
	movdqu	0xc0-0x80($ctx),$G
	movdqu	0xe0-0x80($ctx),$H
	movdqu	.Lpbswap(%rip),$Xn
	jmp	.Loop

.align	32
.Loop:
	movdqa	$C,$bxc
	pxor	$B,$bxc				# magic seed
___
for($i=0;$i<16;$i++)	{ &ROUND_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	movdqu	`&Xi_off($i)`,$Xi
	mov	\$3,%ecx
	jmp	.Loop_16_xx
.align	32
.Loop_16_xx:
___
for(;$i<32;$i++)	{ &ROUND_16_XX($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	dec	%ecx
	jnz	.Loop_16_xx

	mov	\$1,%ecx
	lea	K256+128(%rip),$Tbl

	movdqa	(%rbx),$sigma			# pull counters
	cmp	4*0(%rbx),%ecx			# examine counters
	pxor	$t1,$t1
	cmovge	$Tbl,@ptr[0]			# cancel input
	cmp	4*1(%rbx),%ecx
	movdqa	$sigma,$Xn
	cmovge	$Tbl,@ptr[1]
	cmp	4*2(%rbx),%ecx
	pcmpgtd	$t1,$Xn				# mask value
	cmovge	$Tbl,@ptr[2]
	cmp	4*3(%rbx),%ecx
	paddd	$Xn,$sigma			# counters--
	cmovge	$Tbl,@ptr[3]

	movdqu	0x00-0x80($ctx),$t1
	pand	$Xn,$A
	movdqu	0x20-0x80($ctx),$t2
	pand	$Xn,$B
	movdqu	0x40-0x80($ctx),$t3
	pand	$Xn,$C
	movdqu	0x60-0x80($ctx),$Xi
	pand	$Xn,$D
	paddd	$t1,$A
	movdqu	0x80-0x80($ctx),$t1
	pand	$Xn,$E
	paddd	$t2,$B
	movdqu	0xa0-0x80($ctx),$t2
	pand	$Xn,$F
	paddd	$t3,$C
	movdqu	0xc0-0x80($ctx),$t3
	pand	$Xn,$G
	paddd	$Xi,$D
	movdqu	0xe0-0x80($ctx),$Xi
	pand	$Xn,$H
	paddd	$t1,$E
	paddd	$t2,$F
	movdqu	$A,0x00-0x80($ctx)
	paddd	$t3,$G
	movdqu	$B,0x20-0x80($ctx)
	paddd	$Xi,$H
	movdqu	$C,0x40-0x80($ctx)
	movdqu	$D,0x60-0x80($ctx)
	movdqu	$E,0x80-0x80($ctx)
	movdqu	$F,0xa0-0x80($ctx)
	movdqu	$G,0xc0-0x80($ctx)
	movdqu	$H,0xe0-0x80($ctx)

	movdqa	$sigma,(%rbx)			# save counters
	movdqa	.Lpbswap(%rip),$Xn
	dec	$num
	jnz	.Loop

	mov	`$REG_SZ*17+8`(%rsp),$num
	lea	$REG_SZ($ctx),$ctx
	lea	`16*$REG_SZ/4`($inp),$inp
	dec	$num
	jnz	.Loop_grande

.Ldone:
	mov	`$REG_SZ*17`(%rsp),%rax		# orignal %rsp
___
$code.=<<___ if ($win64);
	movaps	-0xb8(%rax),%xmm6
	movaps	-0xa8(%rax),%xmm7
	movaps	-0x98(%rax),%xmm8
	movaps	-0x88(%rax),%xmm9
	movaps	-0x78(%rax),%xmm10
	movaps	-0x68(%rax),%xmm11
	movaps	-0x58(%rax),%xmm12
	movaps	-0x48(%rax),%xmm13
	movaps	-0x38(%rax),%xmm14
	movaps	-0x28(%rax),%xmm15
___
$code.=<<___;
	mov	-16(%rax),%rbp
	mov	-8(%rax),%rbx
	lea	(%rax),%rsp
.Lepilogue:
	ret
.size	sha256_multi_block,.-sha256_multi_block
___
408 my ($Wi,$TMP0,$TMP1,$TMPx,$ABEF0,$CDGH0,$ABEF1,$CDGH1)=map("%xmm$_",(0..3,12..15));
409 my @MSG0=map("%xmm$_",(4..7));
410 my @MSG1=map("%xmm$_",(8..11));
412 $code.=<<___;
413 .type sha256_multi_block_shaext,\@function,3
414 .align 32
415 sha256_multi_block_shaext:
416 _shaext_shortcut:
417 mov %rsp,%rax
418 push %rbx
419 push %rbp
421 $code.=<<___ if ($win64);
422 lea -0xa8(%rsp),%rsp
423 movaps %xmm6,(%rsp)
424 movaps %xmm7,0x10(%rsp)
425 movaps %xmm8,0x20(%rsp)
426 movaps %xmm9,0x30(%rsp)
427 movaps %xmm10,-0x78(%rax)
428 movaps %xmm11,-0x68(%rax)
429 movaps %xmm12,-0x58(%rax)
430 movaps %xmm13,-0x48(%rax)
431 movaps %xmm14,-0x38(%rax)
432 movaps %xmm15,-0x28(%rax)
434 $code.=<<___;
435 sub \$`$REG_SZ*18`,%rsp
436 shl \$1,$num # we process pair at a time
437 and \$-256,%rsp
438 lea 0x80($ctx),$ctx # size optimization
439 mov %rax,`$REG_SZ*17`(%rsp) # original %rsp
440 .Lbody_shaext:
441 lea `$REG_SZ*16`(%rsp),%rbx
442 lea K256_shaext+0x80(%rip),$Tbl
444 .Loop_grande_shaext:
445 mov $num,`$REG_SZ*17+8`(%rsp) # orignal $num
446 xor $num,$num
448 for($i=0;$i<2;$i++) {
449 $code.=<<___;
450 mov `16*$i+0`($inp),@ptr[$i] # input pointer
451 mov `16*$i+8`($inp),%ecx # number of blocks
452 cmp $num,%ecx
453 cmovg %ecx,$num # find maximum
454 test %ecx,%ecx
455 mov %ecx,`4*$i`(%rbx) # initialize counters
456 cmovle %rsp,@ptr[$i] # cancel input
459 $code.=<<___;
460 test $num,$num
461 jz .Ldone_shaext
463 movq 0x00-0x80($ctx),$ABEF0 # A1.A0
464 movq 0x20-0x80($ctx),@MSG0[0] # B1.B0
465 movq 0x40-0x80($ctx),$CDGH0 # C1.C0
466 movq 0x60-0x80($ctx),@MSG0[1] # D1.D0
467 movq 0x80-0x80($ctx),@MSG1[0] # E1.E0
468 movq 0xa0-0x80($ctx),@MSG1[1] # F1.F0
469 movq 0xc0-0x80($ctx),@MSG1[2] # G1.G0
470 movq 0xe0-0x80($ctx),@MSG1[3] # H1.H0
472 punpckldq @MSG0[0],$ABEF0 # B1.A1.B0.A0
473 punpckldq @MSG0[1],$CDGH0 # D1.C1.D0.C0
474 punpckldq @MSG1[1],@MSG1[0] # F1.E1.F0.E0
475 punpckldq @MSG1[3],@MSG1[2] # H1.G1.H0.G0
476 movdqa K256_shaext-0x10(%rip),$TMPx # byte swap
478 movdqa $ABEF0,$ABEF1
479 movdqa $CDGH0,$CDGH1
480 punpcklqdq @MSG1[0],$ABEF0 # F0.E0.B0.A0
481 punpcklqdq @MSG1[2],$CDGH0 # H0.G0.D0.C0
482 punpckhqdq @MSG1[0],$ABEF1 # F1.E1.B1.A1
483 punpckhqdq @MSG1[2],$CDGH1 # H1.G1.D1.C1
485 pshufd \$0b00011011,$ABEF0,$ABEF0
486 pshufd \$0b00011011,$CDGH0,$CDGH0
487 pshufd \$0b00011011,$ABEF1,$ABEF1
488 pshufd \$0b00011011,$CDGH1,$CDGH1
489 jmp .Loop_shaext
491 .align 32
492 .Loop_shaext:
493 movdqu 0x00(@ptr[0]),@MSG0[0]
494 movdqu 0x00(@ptr[1]),@MSG1[0]
495 movdqu 0x10(@ptr[0]),@MSG0[1]
496 movdqu 0x10(@ptr[1]),@MSG1[1]
497 movdqu 0x20(@ptr[0]),@MSG0[2]
498 pshufb $TMPx,@MSG0[0]
499 movdqu 0x20(@ptr[1]),@MSG1[2]
500 pshufb $TMPx,@MSG1[0]
501 movdqu 0x30(@ptr[0]),@MSG0[3]
502 lea 0x40(@ptr[0]),@ptr[0]
503 movdqu 0x30(@ptr[1]),@MSG1[3]
504 lea 0x40(@ptr[1]),@ptr[1]
506 movdqa 0*16-0x80($Tbl),$Wi
507 pshufb $TMPx,@MSG0[1]
508 paddd @MSG0[0],$Wi
509 pxor $ABEF0,@MSG0[0] # black magic
510 movdqa $Wi,$TMP0
511 movdqa 0*16-0x80($Tbl),$TMP1
512 pshufb $TMPx,@MSG1[1]
513 paddd @MSG1[0],$TMP1
514 movdqa $CDGH0,0x50(%rsp) # offload
515 sha256rnds2 $ABEF0,$CDGH0 # 0-3
516 pxor $ABEF1,@MSG1[0] # black magic
517 movdqa $TMP1,$Wi
518 movdqa $CDGH1,0x70(%rsp)
519 sha256rnds2 $ABEF1,$CDGH1 # 0-3
520 pshufd \$0x0e,$TMP0,$Wi
521 pxor $ABEF0,@MSG0[0] # black magic
522 movdqa $ABEF0,0x40(%rsp) # offload
523 sha256rnds2 $CDGH0,$ABEF0
524 pshufd \$0x0e,$TMP1,$Wi
525 pxor $ABEF1,@MSG1[0] # black magic
526 movdqa $ABEF1,0x60(%rsp)
527 movdqa 1*16-0x80($Tbl),$TMP0
528 paddd @MSG0[1],$TMP0
529 pshufb $TMPx,@MSG0[2]
530 sha256rnds2 $CDGH1,$ABEF1
532 movdqa $TMP0,$Wi
533 movdqa 1*16-0x80($Tbl),$TMP1
534 paddd @MSG1[1],$TMP1
535 sha256rnds2 $ABEF0,$CDGH0 # 4-7
536 movdqa $TMP1,$Wi
537 prefetcht0 127(@ptr[0])
538 pshufb $TMPx,@MSG0[3]
539 pshufb $TMPx,@MSG1[2]
540 prefetcht0 127(@ptr[1])
541 sha256rnds2 $ABEF1,$CDGH1 # 4-7
542 pshufd \$0x0e,$TMP0,$Wi
543 pshufb $TMPx,@MSG1[3]
544 sha256msg1 @MSG0[1],@MSG0[0]
545 sha256rnds2 $CDGH0,$ABEF0
546 pshufd \$0x0e,$TMP1,$Wi
547 movdqa 2*16-0x80($Tbl),$TMP0
548 paddd @MSG0[2],$TMP0
549 sha256rnds2 $CDGH1,$ABEF1
551 movdqa $TMP0,$Wi
552 movdqa 2*16-0x80($Tbl),$TMP1
553 paddd @MSG1[2],$TMP1
554 sha256rnds2 $ABEF0,$CDGH0 # 8-11
555 sha256msg1 @MSG1[1],@MSG1[0]
556 movdqa $TMP1,$Wi
557 movdqa @MSG0[3],$TMPx
558 sha256rnds2 $ABEF1,$CDGH1 # 8-11
559 pshufd \$0x0e,$TMP0,$Wi
560 palignr \$4,@MSG0[2],$TMPx
561 paddd $TMPx,@MSG0[0]
562 movdqa @MSG1[3],$TMPx
563 palignr \$4,@MSG1[2],$TMPx
564 sha256msg1 @MSG0[2],@MSG0[1]
565 sha256rnds2 $CDGH0,$ABEF0
566 pshufd \$0x0e,$TMP1,$Wi
567 movdqa 3*16-0x80($Tbl),$TMP0
568 paddd @MSG0[3],$TMP0
569 sha256rnds2 $CDGH1,$ABEF1
570 sha256msg1 @MSG1[2],@MSG1[1]
572 movdqa $TMP0,$Wi
573 movdqa 3*16-0x80($Tbl),$TMP1
574 paddd $TMPx,@MSG1[0]
575 paddd @MSG1[3],$TMP1
576 sha256msg2 @MSG0[3],@MSG0[0]
577 sha256rnds2 $ABEF0,$CDGH0 # 12-15
578 movdqa $TMP1,$Wi
579 movdqa @MSG0[0],$TMPx
580 palignr \$4,@MSG0[3],$TMPx
581 sha256rnds2 $ABEF1,$CDGH1 # 12-15
582 sha256msg2 @MSG1[3],@MSG1[0]
583 pshufd \$0x0e,$TMP0,$Wi
584 paddd $TMPx,@MSG0[1]
585 movdqa @MSG1[0],$TMPx
586 palignr \$4,@MSG1[3],$TMPx
587 sha256msg1 @MSG0[3],@MSG0[2]
588 sha256rnds2 $CDGH0,$ABEF0
589 pshufd \$0x0e,$TMP1,$Wi
590 movdqa 4*16-0x80($Tbl),$TMP0
591 paddd @MSG0[0],$TMP0
592 sha256rnds2 $CDGH1,$ABEF1
593 sha256msg1 @MSG1[3],@MSG1[2]
595 for($i=4;$i<16-3;$i++) {
596 $code.=<<___;
597 movdqa $TMP0,$Wi
598 movdqa $i*16-0x80($Tbl),$TMP1
599 paddd $TMPx,@MSG1[1]
600 paddd @MSG1[0],$TMP1
601 sha256msg2 @MSG0[0],@MSG0[1]
602 sha256rnds2 $ABEF0,$CDGH0 # 16-19...
603 movdqa $TMP1,$Wi
604 movdqa @MSG0[1],$TMPx
605 palignr \$4,@MSG0[0],$TMPx
606 sha256rnds2 $ABEF1,$CDGH1 # 16-19...
607 sha256msg2 @MSG1[0],@MSG1[1]
608 pshufd \$0x0e,$TMP0,$Wi
609 paddd $TMPx,@MSG0[2]
610 movdqa @MSG1[1],$TMPx
611 palignr \$4,@MSG1[0],$TMPx
612 sha256msg1 @MSG0[0],@MSG0[3]
613 sha256rnds2 $CDGH0,$ABEF0
614 pshufd \$0x0e,$TMP1,$Wi
615 movdqa `($i+1)*16`-0x80($Tbl),$TMP0
616 paddd @MSG0[1],$TMP0
617 sha256rnds2 $CDGH1,$ABEF1
618 sha256msg1 @MSG1[0],@MSG1[3]
620 push(@MSG0,shift(@MSG0)); push(@MSG1,shift(@MSG1));
622 $code.=<<___;
623 movdqa $TMP0,$Wi
624 movdqa 13*16-0x80($Tbl),$TMP1
625 paddd $TMPx,@MSG1[1]
626 paddd @MSG1[0],$TMP1
627 sha256msg2 @MSG0[0],@MSG0[1]
628 sha256rnds2 $ABEF0,$CDGH0 # 52-55
629 movdqa $TMP1,$Wi
630 movdqa @MSG0[1],$TMPx
631 palignr \$4,@MSG0[0],$TMPx
632 sha256rnds2 $ABEF1,$CDGH1 # 52-55
633 sha256msg2 @MSG1[0],@MSG1[1]
634 pshufd \$0x0e,$TMP0,$Wi
635 paddd $TMPx,@MSG0[2]
636 movdqa @MSG1[1],$TMPx
637 palignr \$4,@MSG1[0],$TMPx
639 sha256rnds2 $CDGH0,$ABEF0
640 pshufd \$0x0e,$TMP1,$Wi
641 movdqa 14*16-0x80($Tbl),$TMP0
642 paddd @MSG0[1],$TMP0
643 sha256rnds2 $CDGH1,$ABEF1
645 movdqa $TMP0,$Wi
646 movdqa 14*16-0x80($Tbl),$TMP1
647 paddd $TMPx,@MSG1[2]
648 paddd @MSG1[1],$TMP1
649 sha256msg2 @MSG0[1],@MSG0[2]
651 sha256rnds2 $ABEF0,$CDGH0 # 56-59
652 movdqa $TMP1,$Wi
653 mov \$1,%ecx
654 pxor @MSG0[1],@MSG0[1] # zero
655 sha256rnds2 $ABEF1,$CDGH1 # 56-59
656 sha256msg2 @MSG1[1],@MSG1[2]
657 pshufd \$0x0e,$TMP0,$Wi
658 movdqa 15*16-0x80($Tbl),$TMP0
659 paddd @MSG0[2],$TMP0
660 movq (%rbx),@MSG0[2] # pull counters
662 sha256rnds2 $CDGH0,$ABEF0
663 pshufd \$0x0e,$TMP1,$Wi
664 movdqa 15*16-0x80($Tbl),$TMP1
665 paddd @MSG1[2],$TMP1
666 sha256rnds2 $CDGH1,$ABEF1
668 movdqa $TMP0,$Wi
669 cmp 4*0(%rbx),%ecx # examine counters
670 cmovge %rsp,@ptr[0] # cancel input
671 cmp 4*1(%rbx),%ecx
672 cmovge %rsp,@ptr[1]
673 pshufd \$0x00,@MSG0[2],@MSG1[0]
674 sha256rnds2 $ABEF0,$CDGH0 # 60-63
675 movdqa $TMP1,$Wi
676 pshufd \$0x55,@MSG0[2],@MSG1[1]
677 movdqa @MSG0[2],@MSG1[2]
678 sha256rnds2 $ABEF1,$CDGH1 # 60-63
679 pshufd \$0x0e,$TMP0,$Wi
680 pcmpgtd @MSG0[1],@MSG1[0]
681 pcmpgtd @MSG0[1],@MSG1[1]
682 sha256rnds2 $CDGH0,$ABEF0
683 pshufd \$0x0e,$TMP1,$Wi
684 pcmpgtd @MSG0[1],@MSG1[2] # counter mask
685 movdqa K256_shaext-0x10(%rip),$TMPx
686 sha256rnds2 $CDGH1,$ABEF1
688 pand @MSG1[0],$CDGH0
689 pand @MSG1[1],$CDGH1
690 pand @MSG1[0],$ABEF0
691 pand @MSG1[1],$ABEF1
692 paddd @MSG0[2],@MSG1[2] # counters--
694 paddd 0x50(%rsp),$CDGH0
695 paddd 0x70(%rsp),$CDGH1
696 paddd 0x40(%rsp),$ABEF0
697 paddd 0x60(%rsp),$ABEF1
699 movq @MSG1[2],(%rbx) # save counters
700 dec $num
701 jnz .Loop_shaext
703 mov `$REG_SZ*17+8`(%rsp),$num
705 pshufd \$0b00011011,$ABEF0,$ABEF0
706 pshufd \$0b00011011,$CDGH0,$CDGH0
707 pshufd \$0b00011011,$ABEF1,$ABEF1
708 pshufd \$0b00011011,$CDGH1,$CDGH1
710 movdqa $ABEF0,@MSG0[0]
711 movdqa $CDGH0,@MSG0[1]
712 punpckldq $ABEF1,$ABEF0 # B1.B0.A1.A0
713 punpckhdq $ABEF1,@MSG0[0] # F1.F0.E1.E0
714 punpckldq $CDGH1,$CDGH0 # D1.D0.C1.C0
715 punpckhdq $CDGH1,@MSG0[1] # H1.H0.G1.G0
717 movq $ABEF0,0x00-0x80($ctx) # A1.A0
718 psrldq \$8,$ABEF0
719 movq @MSG0[0],0x80-0x80($ctx) # E1.E0
720 psrldq \$8,@MSG0[0]
721 movq $ABEF0,0x20-0x80($ctx) # B1.B0
722 movq @MSG0[0],0xa0-0x80($ctx) # F1.F0
724 movq $CDGH0,0x40-0x80($ctx) # C1.C0
725 psrldq \$8,$CDGH0
726 movq @MSG0[1],0xc0-0x80($ctx) # G1.G0
727 psrldq \$8,@MSG0[1]
728 movq $CDGH0,0x60-0x80($ctx) # D1.D0
729 movq @MSG0[1],0xe0-0x80($ctx) # H1.H0
731 lea `$REG_SZ/2`($ctx),$ctx
732 lea `16*2`($inp),$inp
733 dec $num
734 jnz .Loop_grande_shaext
736 .Ldone_shaext:
737 #mov `$REG_SZ*17`(%rsp),%rax # original %rsp
739 $code.=<<___ if ($win64);
740 movaps -0xb8(%rax),%xmm6
741 movaps -0xa8(%rax),%xmm7
742 movaps -0x98(%rax),%xmm8
743 movaps -0x88(%rax),%xmm9
744 movaps -0x78(%rax),%xmm10
745 movaps -0x68(%rax),%xmm11
746 movaps -0x58(%rax),%xmm12
747 movaps -0x48(%rax),%xmm13
748 movaps -0x38(%rax),%xmm14
749 movaps -0x28(%rax),%xmm15
751 $code.=<<___;
752 mov -16(%rax),%rbp
753 mov -8(%rax),%rbx
754 lea (%rax),%rsp
755 .Lepilogue_shaext:
757 .size sha256_multi_block_shaext,.-sha256_multi_block_shaext
if ($avx) {{{
# AVX flavour of one SHA256 round; same dataflow as ROUND_00_15 but with
# non-destructive three-operand instructions.  Input gathering uses
# vpinsrd (4 XMM lanes) or additionally vinserti128 when $REG_SZ==32
# (8 YMM lanes, AVX2).
sub ROUND_00_15_avx {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;

$code.=<<___ if ($i<15 && $REG_SZ==16);
	vmovd	`4*$i`(@ptr[0]),$Xi
	vmovd	`4*$i`(@ptr[1]),$t1
	vpinsrd	\$1,`4*$i`(@ptr[2]),$Xi,$Xi
	vpinsrd	\$1,`4*$i`(@ptr[3]),$t1,$t1
	vpunpckldq	$t1,$Xi,$Xi
	vpshufb	$Xn,$Xi,$Xi
___
$code.=<<___ if ($i==15 && $REG_SZ==16);
	vmovd	`4*$i`(@ptr[0]),$Xi
	lea	`16*4`(@ptr[0]),@ptr[0]
	vmovd	`4*$i`(@ptr[1]),$t1
	lea	`16*4`(@ptr[1]),@ptr[1]
	vpinsrd	\$1,`4*$i`(@ptr[2]),$Xi,$Xi
	lea	`16*4`(@ptr[2]),@ptr[2]
	vpinsrd	\$1,`4*$i`(@ptr[3]),$t1,$t1
	lea	`16*4`(@ptr[3]),@ptr[3]
	vpunpckldq	$t1,$Xi,$Xi
	vpshufb	$Xn,$Xi,$Xi
___
$code.=<<___ if ($i<15 && $REG_SZ==32);
	vmovd	`4*$i`(@ptr[0]),$Xi
	vmovd	`4*$i`(@ptr[4]),$t1
	vmovd	`4*$i`(@ptr[1]),$t2
	vmovd	`4*$i`(@ptr[5]),$t3
	vpinsrd	\$1,`4*$i`(@ptr[2]),$Xi,$Xi
	vpinsrd	\$1,`4*$i`(@ptr[6]),$t1,$t1
	vpinsrd	\$1,`4*$i`(@ptr[3]),$t2,$t2
	vpunpckldq	$t2,$Xi,$Xi
	vpinsrd	\$1,`4*$i`(@ptr[7]),$t3,$t3
	vpunpckldq	$t3,$t1,$t1
	vinserti128	$t1,$Xi,$Xi
	vpshufb	$Xn,$Xi,$Xi
___
$code.=<<___ if ($i==15 && $REG_SZ==32);
	vmovd	`4*$i`(@ptr[0]),$Xi
	lea	`16*4`(@ptr[0]),@ptr[0]
	vmovd	`4*$i`(@ptr[4]),$t1
	lea	`16*4`(@ptr[4]),@ptr[4]
	vmovd	`4*$i`(@ptr[1]),$t2
	lea	`16*4`(@ptr[1]),@ptr[1]
	vmovd	`4*$i`(@ptr[5]),$t3
	lea	`16*4`(@ptr[5]),@ptr[5]
	vpinsrd	\$1,`4*$i`(@ptr[2]),$Xi,$Xi
	lea	`16*4`(@ptr[2]),@ptr[2]
	vpinsrd	\$1,`4*$i`(@ptr[6]),$t1,$t1
	lea	`16*4`(@ptr[6]),@ptr[6]
	vpinsrd	\$1,`4*$i`(@ptr[3]),$t2,$t2
	lea	`16*4`(@ptr[3]),@ptr[3]
	vpunpckldq	$t2,$Xi,$Xi
	vpinsrd	\$1,`4*$i`(@ptr[7]),$t3,$t3
	lea	`16*4`(@ptr[7]),@ptr[7]
	vpunpckldq	$t3,$t1,$t1
	vinserti128	$t1,$Xi,$Xi
	vpshufb	$Xn,$Xi,$Xi
___
$code.=<<___;
	vpsrld	\$6,$e,$sigma
	vpslld	\$26,$e,$t3
	vmovdqu	$Xi,`&Xi_off($i)`
	vpaddd	$h,$Xi,$Xi		# Xi+=h

	vpsrld	\$11,$e,$t2
	vpxor	$t3,$sigma,$sigma
	vpslld	\$21,$e,$t3
	vpaddd	`32*($i%8)-128`($Tbl),$Xi,$Xi	# Xi+=K[round]
	vpxor	$t2,$sigma,$sigma

	vpsrld	\$25,$e,$t2
	vpxor	$t3,$sigma,$sigma
	`"prefetcht0	63(@ptr[0])"		if ($i==15)`
	vpslld	\$7,$e,$t3
	vpandn	$g,$e,$t1
	vpand	$f,$e,$axb		# borrow $axb
	`"prefetcht0	63(@ptr[1])"		if ($i==15)`
	vpxor	$t2,$sigma,$sigma

	vpsrld	\$2,$a,$h		# borrow $h
	vpxor	$t3,$sigma,$sigma	# Sigma1(e)
	`"prefetcht0	63(@ptr[2])"		if ($i==15)`
	vpslld	\$30,$a,$t2
	vpxor	$axb,$t1,$t1		# Ch(e,f,g)
	vpxor	$a,$b,$axb		# a^b, b^c in next round
	`"prefetcht0	63(@ptr[3])"		if ($i==15)`
	vpxor	$t2,$h,$h
	vpaddd	$sigma,$Xi,$Xi		# Xi+=Sigma1(e)

	vpsrld	\$13,$a,$t2
	`"prefetcht0	63(@ptr[4])"		if ($i==15 && $REG_SZ==32)`
	vpslld	\$19,$a,$t3
	vpaddd	$t1,$Xi,$Xi		# Xi+=Ch(e,f,g)
	vpand	$axb,$bxc,$bxc
	`"prefetcht0	63(@ptr[5])"		if ($i==15 && $REG_SZ==32)`
	vpxor	$t2,$h,$sigma

	vpsrld	\$22,$a,$t2
	vpxor	$t3,$sigma,$sigma
	`"prefetcht0	63(@ptr[6])"		if ($i==15 && $REG_SZ==32)`
	vpslld	\$10,$a,$t3
	vpxor	$bxc,$b,$h		# h=Maj(a,b,c)=Ch(a^b,c,b)
	vpaddd	$Xi,$d,$d		# d+=Xi
	`"prefetcht0	63(@ptr[7])"		if ($i==15 && $REG_SZ==32)`
	vpxor	$t2,$sigma,$sigma
	vpxor	$t3,$sigma,$sigma	# Sigma0(a)

	vpaddd	$Xi,$h,$h		# h+=Xi
	vpaddd	$sigma,$h,$h		# h+=Sigma0(a)
___
$code.=<<___ if (($i%8)==7);
	add	\$`32*8`,$Tbl
___
	($axb,$bxc)=($bxc,$axb);
}
# AVX flavour of the schedule-expansion round: computes
# X[i] += sigma0(X[i+1]) + sigma1(X[i+14]) + X[i+9], then ROUND_00_15_avx.
sub ROUND_16_XX_avx {
my $i=shift;

$code.=<<___;
	vmovdqu	`&Xi_off($i+1)`,$Xn
	vpaddd	`&Xi_off($i+9)`,$Xi,$Xi		# Xi+=X[i+9]

	vpsrld	\$3,$Xn,$sigma
	vpsrld	\$7,$Xn,$t2
	vpslld	\$25,$Xn,$t3
	vpxor	$t2,$sigma,$sigma
	vpsrld	\$18,$Xn,$t2
	vpxor	$t3,$sigma,$sigma
	vpslld	\$14,$Xn,$t3
	vmovdqu	`&Xi_off($i+14)`,$t1
	vpsrld	\$10,$t1,$axb			# borrow $axb

	vpxor	$t2,$sigma,$sigma
	vpsrld	\$17,$t1,$t2
	vpxor	$t3,$sigma,$sigma		# sigma0(X[i+1])
	vpslld	\$15,$t1,$t3
	vpaddd	$sigma,$Xi,$Xi			# Xi+=sigma0(e)
	vpxor	$t2,$axb,$sigma
	vpsrld	\$19,$t1,$t2
	vpxor	$t3,$sigma,$sigma
	vpslld	\$13,$t1,$t3
	vpxor	$t2,$sigma,$sigma
	vpxor	$t3,$sigma,$sigma		# sigma0(X[i+14])
	vpaddd	$sigma,$Xi,$Xi			# Xi+=sigma1(X[i+14])
___
	&ROUND_00_15_avx($i,@_);
	($Xi,$Xn)=($Xn,$Xi);
}
# Generate the AVX entry point: 4 buffers in XMM lanes; when compiled
# with $avx>1 it also dispatches to the AVX2 code on capable CPUs.
$code.=<<___;
.type	sha256_multi_block_avx,\@function,3
.align	32
sha256_multi_block_avx:
_avx_shortcut:
___
$code.=<<___ if ($avx>1);
	shr	\$32,%rcx
	cmp	\$2,$num
	jb	.Lavx
	test	\$`1<<5`,%ecx
	jnz	_avx2_shortcut
	jmp	.Lavx
.align	32
.Lavx:
___
$code.=<<___;
	mov	%rsp,%rax
	push	%rbx
	push	%rbp
___
$code.=<<___ if ($win64);
	lea	-0xa8(%rsp),%rsp
	movaps	%xmm6,(%rsp)
	movaps	%xmm7,0x10(%rsp)
	movaps	%xmm8,0x20(%rsp)
	movaps	%xmm9,0x30(%rsp)
	movaps	%xmm10,-0x78(%rax)
	movaps	%xmm11,-0x68(%rax)
	movaps	%xmm12,-0x58(%rax)
	movaps	%xmm13,-0x48(%rax)
	movaps	%xmm14,-0x38(%rax)
	movaps	%xmm15,-0x28(%rax)
___
$code.=<<___;
	sub	\$`$REG_SZ*18`, %rsp
	and	\$-256,%rsp
	mov	%rax,`$REG_SZ*17`(%rsp)		# original %rsp
.Lbody_avx:
	lea	K256+128(%rip),$Tbl
	lea	`$REG_SZ*16`(%rsp),%rbx
	lea	0x80($ctx),$ctx			# size optimization

.Loop_grande_avx:
	mov	$num,`$REG_SZ*17+8`(%rsp)	# original $num
	xor	$num,$num
___
for($i=0;$i<4;$i++) {
$code.=<<___;
	mov	`16*$i+0`($inp),@ptr[$i]	# input pointer
	mov	`16*$i+8`($inp),%ecx		# number of blocks
	cmp	$num,%ecx
	cmovg	%ecx,$num			# find maximum
	test	%ecx,%ecx
	mov	%ecx,`4*$i`(%rbx)		# initialize counters
	cmovle	$Tbl,@ptr[$i]			# cancel input
___
}
$code.=<<___;
	test	$num,$num
	jz	.Ldone_avx

	vmovdqu	0x00-0x80($ctx),$A		# load context
	lea	128(%rsp),%rax
	vmovdqu	0x20-0x80($ctx),$B
	vmovdqu	0x40-0x80($ctx),$C
	vmovdqu	0x60-0x80($ctx),$D
	vmovdqu	0x80-0x80($ctx),$E
	vmovdqu	0xa0-0x80($ctx),$F
	vmovdqu	0xc0-0x80($ctx),$G
	vmovdqu	0xe0-0x80($ctx),$H
	vmovdqu	.Lpbswap(%rip),$Xn
	jmp	.Loop_avx

.align	32
.Loop_avx:
	vpxor	$B,$C,$bxc			# magic seed
___
for($i=0;$i<16;$i++)	{ &ROUND_00_15_avx($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	vmovdqu	`&Xi_off($i)`,$Xi
	mov	\$3,%ecx
	jmp	.Loop_16_xx_avx
.align	32
.Loop_16_xx_avx:
___
for(;$i<32;$i++)	{ &ROUND_16_XX_avx($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	dec	%ecx
	jnz	.Loop_16_xx_avx

	mov	\$1,%ecx
	lea	K256+128(%rip),$Tbl
___
for($i=0;$i<4;$i++) {
$code.=<<___;
	cmp	`4*$i`(%rbx),%ecx		# examine counters
	cmovge	$Tbl,@ptr[$i]			# cancel input
___
}
$code.=<<___;
	vmovdqa	(%rbx),$sigma			# pull counters
	vpxor	$t1,$t1,$t1
	vmovdqa	$sigma,$Xn
	vpcmpgtd $t1,$Xn,$Xn			# mask value
	vpaddd	$Xn,$sigma,$sigma		# counters--

	vmovdqu	0x00-0x80($ctx),$t1
	vpand	$Xn,$A,$A
	vmovdqu	0x20-0x80($ctx),$t2
	vpand	$Xn,$B,$B
	vmovdqu	0x40-0x80($ctx),$t3
	vpand	$Xn,$C,$C
	vmovdqu	0x60-0x80($ctx),$Xi
	vpand	$Xn,$D,$D
	vpaddd	$t1,$A,$A
	vmovdqu	0x80-0x80($ctx),$t1
	vpand	$Xn,$E,$E
	vpaddd	$t2,$B,$B
	vmovdqu	0xa0-0x80($ctx),$t2
	vpand	$Xn,$F,$F
	vpaddd	$t3,$C,$C
	vmovdqu	0xc0-0x80($ctx),$t3
	vpand	$Xn,$G,$G
	vpaddd	$Xi,$D,$D
	vmovdqu	0xe0-0x80($ctx),$Xi
	vpand	$Xn,$H,$H
	vpaddd	$t1,$E,$E
	vpaddd	$t2,$F,$F
	vmovdqu	$A,0x00-0x80($ctx)
	vpaddd	$t3,$G,$G
	vmovdqu	$B,0x20-0x80($ctx)
	vpaddd	$Xi,$H,$H
	vmovdqu	$C,0x40-0x80($ctx)
	vmovdqu	$D,0x60-0x80($ctx)
	vmovdqu	$E,0x80-0x80($ctx)
	vmovdqu	$F,0xa0-0x80($ctx)
	vmovdqu	$G,0xc0-0x80($ctx)
	vmovdqu	$H,0xe0-0x80($ctx)

	vmovdqu	$sigma,(%rbx)			# save counters
	vmovdqu	.Lpbswap(%rip),$Xn
	dec	$num
	jnz	.Loop_avx

	mov	`$REG_SZ*17+8`(%rsp),$num
	lea	$REG_SZ($ctx),$ctx
	lea	`16*$REG_SZ/4`($inp),$inp
	dec	$num
	jnz	.Loop_grande_avx

.Ldone_avx:
	mov	`$REG_SZ*17`(%rsp),%rax		# orignal %rsp
	vzeroupper
___
$code.=<<___ if ($win64);
	movaps	-0xb8(%rax),%xmm6
	movaps	-0xa8(%rax),%xmm7
	movaps	-0x98(%rax),%xmm8
	movaps	-0x88(%rax),%xmm9
	movaps	-0x78(%rax),%xmm10
	movaps	-0x68(%rax),%xmm11
	movaps	-0x58(%rax),%xmm12
	movaps	-0x48(%rax),%xmm13
	movaps	-0x38(%rax),%xmm14
	movaps	-0x28(%rax),%xmm15
___
$code.=<<___;
	mov	-16(%rax),%rbp
	mov	-8(%rax),%rbx
	lea	(%rax),%rsp
.Lepilogue_avx:
	ret
.size	sha256_multi_block_avx,.-sha256_multi_block_avx
___
1087 if ($avx>1) {
1088 $code =~ s/\`([^\`]*)\`/eval $1/gem;
1090 $REG_SZ=32;
1091 @ptr=map("%r$_",(12..15,8..11));
1093 @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("%ymm$_",(8..15));
1094 ($t1,$t2,$t3,$axb,$bxc,$Xi,$Xn,$sigma)=map("%ymm$_",(0..7));
1096 $code.=<<___;
1097 .type sha256_multi_block_avx2,\@function,3
1098 .align 32
1099 sha256_multi_block_avx2:
1100 _avx2_shortcut:
1101 mov %rsp,%rax
1102 push %rbx
1103 push %rbp
1104 push %r12
1105 push %r13
1106 push %r14
1107 push %r15
1109 $code.=<<___ if ($win64);
1110 lea -0xa8(%rsp),%rsp
1111 movaps %xmm6,(%rsp)
1112 movaps %xmm7,0x10(%rsp)
1113 movaps %xmm8,0x20(%rsp)
1114 movaps %xmm9,0x30(%rsp)
1115 movaps %xmm10,0x40(%rsp)
1116 movaps %xmm11,0x50(%rsp)
1117 movaps %xmm12,-0x78(%rax)
1118 movaps %xmm13,-0x68(%rax)
1119 movaps %xmm14,-0x58(%rax)
1120 movaps %xmm15,-0x48(%rax)
1122 $code.=<<___;
1123 sub \$`$REG_SZ*18`, %rsp
1124 and \$-256,%rsp
1125 mov %rax,`$REG_SZ*17`(%rsp) # original %rsp
1126 .Lbody_avx2:
1127 lea K256+128(%rip),$Tbl
1128 lea 0x80($ctx),$ctx # size optimization
1130 .Loop_grande_avx2:
1131 mov $num,`$REG_SZ*17+8`(%rsp) # original $num
1132 xor $num,$num
1133 lea `$REG_SZ*16`(%rsp),%rbx
1135 for($i=0;$i<8;$i++) {
1136 $code.=<<___;
1137 mov `16*$i+0`($inp),@ptr[$i] # input pointer
1138 mov `16*$i+8`($inp),%ecx # number of blocks
1139 cmp $num,%ecx
1140 cmovg %ecx,$num # find maximum
1141 test %ecx,%ecx
1142 mov %ecx,`4*$i`(%rbx) # initialize counters
1143 cmovle $Tbl,@ptr[$i] # cancel input
1146 $code.=<<___;
1147 vmovdqu 0x00-0x80($ctx),$A # load context
1148 lea 128(%rsp),%rax
1149 vmovdqu 0x20-0x80($ctx),$B
1150 lea 256+128(%rsp),%rbx
1151 vmovdqu 0x40-0x80($ctx),$C
1152 vmovdqu 0x60-0x80($ctx),$D
1153 vmovdqu 0x80-0x80($ctx),$E
1154 vmovdqu 0xa0-0x80($ctx),$F
1155 vmovdqu 0xc0-0x80($ctx),$G
1156 vmovdqu 0xe0-0x80($ctx),$H
1157 vmovdqu .Lpbswap(%rip),$Xn
1158 jmp .Loop_avx2
1160 .align 32
1161 .Loop_avx2:
1162 vpxor $B,$C,$bxc # magic seed
1164 for($i=0;$i<16;$i++) { &ROUND_00_15_avx($i,@V); unshift(@V,pop(@V)); }
1165 $code.=<<___;
1166 vmovdqu `&Xi_off($i)`,$Xi
1167 mov \$3,%ecx
1168 jmp .Loop_16_xx_avx2
1169 .align 32
1170 .Loop_16_xx_avx2:
1172 for(;$i<32;$i++) { &ROUND_16_XX_avx($i,@V); unshift(@V,pop(@V)); }
1173 $code.=<<___;
1174 dec %ecx
1175 jnz .Loop_16_xx_avx2
1177 mov \$1,%ecx
1178 lea `$REG_SZ*16`(%rsp),%rbx
1179 lea K256+128(%rip),$Tbl
1181 for($i=0;$i<8;$i++) {
1182 $code.=<<___;
1183 cmp `4*$i`(%rbx),%ecx # examine counters
1184 cmovge $Tbl,@ptr[$i] # cancel input
# Build a per-lane mask from the counters, add the round results only for
# still-active lanes into the context, store it back, and loop while blocks
# remain.
$code.=<<___;
	vmovdqa	(%rbx),$sigma		# pull counters
	vpxor	$t1,$t1,$t1
	vmovdqa	$sigma,$Xn
	vpcmpgtd $t1,$Xn,$Xn		# mask value
	vpaddd	$Xn,$sigma,$sigma	# counters--

	vmovdqu	0x00-0x80($ctx),$t1
	vpand	$Xn,$A,$A
	vmovdqu	0x20-0x80($ctx),$t2
	vpand	$Xn,$B,$B
	vmovdqu	0x40-0x80($ctx),$t3
	vpand	$Xn,$C,$C
	vmovdqu	0x60-0x80($ctx),$Xi
	vpand	$Xn,$D,$D
	vpaddd	$t1,$A,$A
	vmovdqu	0x80-0x80($ctx),$t1
	vpand	$Xn,$E,$E
	vpaddd	$t2,$B,$B
	vmovdqu	0xa0-0x80($ctx),$t2
	vpand	$Xn,$F,$F
	vpaddd	$t3,$C,$C
	vmovdqu	0xc0-0x80($ctx),$t3
	vpand	$Xn,$G,$G
	vpaddd	$Xi,$D,$D
	vmovdqu	0xe0-0x80($ctx),$Xi
	vpand	$Xn,$H,$H
	vpaddd	$t1,$E,$E
	vpaddd	$t2,$F,$F
	vmovdqu	$A,0x00-0x80($ctx)
	vpaddd	$t3,$G,$G
	vmovdqu	$B,0x20-0x80($ctx)
	vpaddd	$Xi,$H,$H
	vmovdqu	$C,0x40-0x80($ctx)
	vmovdqu	$D,0x60-0x80($ctx)
	vmovdqu	$E,0x80-0x80($ctx)
	vmovdqu	$F,0xa0-0x80($ctx)
	vmovdqu	$G,0xc0-0x80($ctx)
	vmovdqu	$H,0xe0-0x80($ctx)

	vmovdqu	$sigma,(%rbx)		# save counters
	lea	256+128(%rsp),%rbx
	vmovdqu	.Lpbswap(%rip),$Xn
	dec	$num
	jnz	.Loop_avx2

	#mov	`$REG_SZ*17+8`(%rsp),$num
	#lea	$REG_SZ($ctx),$ctx
	#lea	`16*$REG_SZ/4`($inp),$inp
	#dec	$num
	#jnz	.Loop_grande_avx2

.Ldone_avx2:
	mov	`$REG_SZ*17`(%rsp),%rax	# original %rsp
	vzeroupper
___
# Win64 ABI: restore the callee-saved xmm6-xmm15 spilled in the prologue.
$code.=<<___ if ($win64);
	movaps	-0xd8(%rax),%xmm6
	movaps	-0xc8(%rax),%xmm7
	movaps	-0xb8(%rax),%xmm8
	movaps	-0xa8(%rax),%xmm9
	movaps	-0x98(%rax),%xmm10
	movaps	-0x88(%rax),%xmm11
	movaps	-0x78(%rax),%xmm12
	movaps	-0x68(%rax),%xmm13
	movaps	-0x58(%rax),%xmm14
	movaps	-0x48(%rax),%xmm15
___
# Restore callee-saved GPRs, unwind the stack and return.  The `ret` after
# .Lepilogue_avx2 was lost in transcription (cf. se_handler's epilogue
# boundary check) and is restored here.
$code.=<<___;
	mov	-48(%rax),%r15
	mov	-40(%rax),%r14
	mov	-32(%rax),%r13
	mov	-24(%rax),%r12
	mov	-16(%rax),%rbp
	mov	-8(%rax),%rbx
	lea	(%rax),%rsp
.Lepilogue_avx2:
	ret
.size	sha256_multi_block_avx2,.-sha256_multi_block_avx2
___
					}	}}}
# K256 constant table, 256-byte aligned; entries are emitted by TABLE below.
$code.=<<___;
.align	256
K256:
___
# Append each 32-bit constant to $code eight times (one copy per SIMD lane,
# two .long quads) so a single K256 entry can be broadcast-loaded.
sub TABLE {
    foreach (@_) {
	$code.=<<___;
		.long	$_,$_,$_,$_
		.long	$_,$_,$_,$_
___
    }
}
# The 64 SHA-256 round constants (FIPS 180-4).
&TABLE(	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5,
	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5,
	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3,
	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174,
	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc,
	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da,
	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7,
	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967,
	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13,
	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85,
	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3,
	0xd192e819,0xd6990624,0xf40e3585,0x106aa070,
	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5,
	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3,
	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208,
	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 );
# Byte-swap shuffle mask and the unduplicated K256 copy used by the SHA-NI
# (shaext) code path.
$code.=<<___;
.Lpbswap:
	.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f	# pbswap
	.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f	# pbswap
K256_shaext:
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
	.asciz	"SHA256 multi-block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
___
if ($win64) {
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
# Register aliases per the Win64 calling convention for the handler args.
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";
# SEH handler for the SSE/AVX entry points: if the fault lies inside the
# function body, recover the original %rsp (saved at frame+16*17), restore
# the non-volatile registers into *context and let RtlVirtualUnwind continue.
$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
.align	16
se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# end of prologue label
	cmp	%r10,%rbx		# context->Rip<.Lbody
	jb	.Lin_prologue

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
	jae	.Lin_prologue

	mov	`16*17`(%rax),%rax	# pull saved stack pointer

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp

	lea	-24-10*16(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$20,%ecx
	.long	0xa548f3fc		# cld; rep movsq

.Lin_prologue:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	se_handler,.-se_handler
___
# SEH handler for the AVX2 entry point (wider frame: saved %rsp at 32*17,
# six saved GPRs).
# FIX: the saved stack pointer lives at frame+32*17, and %rax already holds
# context->Rsp at this point, so the load must be `32*17`(%rax) — reading
# `32*17`($context) would fetch 544 bytes into the CONTEXT record (Xmm
# area) instead.  This mirrors se_handler's `16*17`(%rax) above.
$code.=<<___ if ($avx>1);
.type	avx2_handler,\@abi-omnipotent
.align	16
avx2_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# end of prologue label
	cmp	%r10,%rbx		# context->Rip<body label
	jb	.Lin_prologue

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lin_prologue

	mov	`32*17`(%rax),%rax	# pull saved stack pointer

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

	lea	-56-10*16(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$20,%ecx
	.long	0xa548f3fc		# cld; rep movsq

	jmp	.Lin_prologue
.size	avx2_handler,.-avx2_handler
___
# Win64 .pdata: function-range entries pointing at the unwind info below.
$code.=<<___;
.section	.pdata
.align	4
	.rva	.LSEH_begin_sha256_multi_block
	.rva	.LSEH_end_sha256_multi_block
	.rva	.LSEH_info_sha256_multi_block
	.rva	.LSEH_begin_sha256_multi_block_shaext
	.rva	.LSEH_end_sha256_multi_block_shaext
	.rva	.LSEH_info_sha256_multi_block_shaext
___
$code.=<<___ if ($avx);
	.rva	.LSEH_begin_sha256_multi_block_avx
	.rva	.LSEH_end_sha256_multi_block_avx
	.rva	.LSEH_info_sha256_multi_block_avx
___
$code.=<<___ if ($avx>1);
	.rva	.LSEH_begin_sha256_multi_block_avx2
	.rva	.LSEH_end_sha256_multi_block_avx2
	.rva	.LSEH_info_sha256_multi_block_avx2
___
# Win64 .xdata: language-specific handler records; HandlerData[] carries the
# body/epilogue labels the handlers compare against.
$code.=<<___;
.section	.xdata
.align	8
.LSEH_info_sha256_multi_block:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lbody,.Lepilogue			# HandlerData[]
.LSEH_info_sha256_multi_block_shaext:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lbody_shaext,.Lepilogue_shaext		# HandlerData[]
___
$code.=<<___ if ($avx);
.LSEH_info_sha256_multi_block_avx:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lbody_avx,.Lepilogue_avx		# HandlerData[]
___
$code.=<<___ if ($avx>1);
.LSEH_info_sha256_multi_block_avx2:
	.byte	9,0,0,0
	.rva	avx2_handler
	.rva	.Lbody_avx2,.Lepilogue_avx2		# HandlerData[]
___
}

####################################################################
# Prepend a REX prefix byte to @opcode (passed by reference) when either
# register operand is r8..r15.
sub rex {
  local *opcode=shift;		# alias the caller's opcode-byte array
  my ($dst,$src)=@_;
  my $rex=0;

    $rex|=0x04			if ($dst>=8);	# REX.R
    $rex|=0x01			if ($src>=8);	# REX.B
    unshift @opcode,$rex|0x40	if ($rex);
}

# Encode the SHA-NI 0F 38 xx instructions (sha256rnds2/msg1/msg2) as raw
# .byte sequences for assemblers that predate the mnemonics; anything else
# is passed through verbatim.
sub sha256op38 {
    my $instr = shift;
    my %opcodelet = (
		"sha256rnds2" => 0xcb,
		"sha256msg1"  => 0xcc,
		"sha256msg2"  => 0xcd	);

    if (defined($opcodelet{$instr}) && $_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
      my @opcode=(0x0f,0x38);
	rex(\@opcode,$2,$1);
	push @opcode,$opcodelet{$instr};
	push @opcode,0xc0|($1&7)|(($2&7)<<3);	# ModR/M
	return ".byte\t".join(',',@opcode);
    } else {
	return $instr."\t".$_[0];
    }
}
# Post-process the generated assembly line by line: evaluate `...` constant
# expressions, encode SHA-NI mnemonics as .byte, and downgrade %ymm forms
# that must be emitted as %xmm for pre-AVX2 assembler compatibility.
foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval($1)/ge;

	s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/geo		or

	s/\b(vmov[dq])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go		or
	s/\b(vmovdqu)\b(.+)%x%ymm([0-9]+)/$1$2%xmm$3/go		or
	s/\b(vpinsr[qd])\b(.+)%ymm([0-9]+),%ymm([0-9]+)/$1$2%xmm$3,%xmm$4/go	or
	s/\b(vpextr[qd])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go	or
	s/\b(vinserti128)\b(\s+)%ymm/$1$2\$1,%xmm/go		or
	s/\b(vpbroadcast[qd]\s+)%ymm([0-9]+)/$1%xmm$2/go;

	print $_,"\n";
}

close STDOUT;