OpenSSL 1.0.2f
[tomato.git] / release / src / router / openssl / crypto / aes / asm / aesni-mb-x86_64.pl
blobd7ad7882c4ee97fab997c4e2a33424acea6a8f77
1 #!/usr/bin/env perl
3 # ====================================================================
4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
10 # Multi-buffer AES-NI procedures process several independent buffers
11 # in parallel by interleaving independent instructions.
13 # Cycles per byte for interleave factor 4:
15 # asymptotic measured
16 # ---------------------------
17 # Westmere 5.00/4=1.25 5.13/4=1.28
18 # Atom 15.0/4=3.75 ?15.7/4=3.93
19 # Sandy Bridge 5.06/4=1.27 5.18/4=1.29
20 # Ivy Bridge 5.06/4=1.27 5.14/4=1.29
21 # Haswell 4.44/4=1.11 4.44/4=1.11
22 # Bulldozer 5.75/4=1.44 5.76/4=1.44
24 # Cycles per byte for interleave factor 8 (not implemented for
25 # pre-AVX processors, where higher interleave factor incidentally
26 # doesn't result in improvement):
28 # asymptotic measured
29 # ---------------------------
30 # Sandy Bridge 5.06/8=0.64 7.10/8=0.89(*)
31 # Ivy Bridge 5.06/8=0.64 7.14/8=0.89(*)
32 # Haswell 5.00/8=0.63 5.00/8=0.63
33 # Bulldozer 5.75/8=0.72 5.77/8=0.72
35 # (*) Sandy/Ivy Bridge are known to handle high interleave factors
36 # suboptimally.
# Command-line handling, perlasm translator lookup and assembler capability probing.
38 $flavour = shift; # first argument: perlasm flavour (or the output file, see below)
39 $output = shift; # second argument: output file
40 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } # a lone argument containing a dot is the output file
42 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); # gates the xmm save/restore code and the SEH handler
44 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; # directory part of this script's path
45 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or # look next to this script first,
46 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or # then in ../../perlasm
47 die "can't locate x86_64-xlate.pl";
49 $avx=0; # stays 0 unless one of the probes below recognises a capable assembler
51 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
52 =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
53 $avx = ($1>=2.19) + ($1>=2.22); # 0, 1 or 2 depending on how many version thresholds are met
56 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
57 `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
58 $avx = ($1>=2.09) + ($1>=2.10); # same scheme for NASM
61 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
62 `ml64 2>&1` =~ /Version ([0-9]+)\./) {
63 $avx = ($1>=10) + ($1>=11); # same scheme for ml64 (major version only)
66 if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
67 $avx = ($2>=3.0) + ($2>3.0); # same scheme for clang/LLVM; version is the second capture
# Pipe everything printed to STDOUT through the perlasm translator. Abort with a
# diagnostic if the pipe cannot be started instead of silently emitting nothing.
70 open OUT,"| \"$^X\" $xlate $flavour $output" or die "can't call $xlate: $!";
71 *STDOUT=*OUT;
73 # void aesni_multi_cbc_encrypt (
74 # struct { void *inp,*out; int blocks; double iv[2]; } inp[8];
75 # const AES_KEY *key,
76 # int num); /* 1 or 2 */
# The loading loops below read each descriptor as 40 bytes:
# inp at +0, out at +8, blocks at +16, iv at +24.
78 $inp="%rdi"; # 1st arg
79 $key="%rsi"; # 2nd arg
80 $num="%edx"; # 3rd arg
82 @inptr=map("%r$_",(8..11)); # per-lane input pointers (r8-r11)
83 @outptr=map("%r$_",(12..15)); # per-lane output pointers (r12-r15)
85 ($rndkey0,$rndkey1)=("%xmm0","%xmm1"); # two round-key registers used alternately
86 @out=map("%xmm$_",(2..5)); # per-lane cipher state
87 @inp=map("%xmm$_",(6..9)); # per-lane input block (encrypt) or IV (decrypt)
88 ($counters,$mask,$zero)=map("%xmm$_",(10..12)); # remaining-block counters, scratch mask, 0-round key
90 ($rounds,$one,$sink,$offset)=("%eax","%ecx","%rbp","%rbx"); # round count, constant 1 / scratch, sink pointer, running byte offset
# Emit the 4-lane CBC encrypt routine: entry point, optional dispatch to the
# AVX variant, register save, stack frame and loading of the four lane descriptors.
92 $code.=<<___;
93 .text
95 .extern OPENSSL_ia32cap_P
97 .globl aesni_multi_cbc_encrypt
98 .type aesni_multi_cbc_encrypt,\@function,3
99 .align 32
100 aesni_multi_cbc_encrypt:
102 $code.=<<___ if ($avx);
103 cmp \$2,$num # num below 2: stay on the 4-lane code
104 jb .Lenc_non_avx
105 mov OPENSSL_ia32cap_P+4(%rip),%ecx
106 test \$`1<<28`,%ecx # AVX bit
107 jnz _avx_cbc_enc_shortcut # bit set: continue in the 8-lane routine
108 jmp .Lenc_non_avx
109 .align 16
110 .Lenc_non_avx:
112 $code.=<<___;
113 mov %rsp,%rax # rax keeps the entry stack pointer for the epilogue
114 push %rbx
115 push %rbp
116 push %r12
117 push %r13
118 push %r14
119 push %r15
121 $code.=<<___ if ($win64);
122 lea -0xa8(%rsp),%rsp # room for the xmm save area below the six pushes
123 movaps %xmm6,(%rsp)
124 movaps %xmm7,0x10(%rsp)
125 movaps %xmm8,0x20(%rsp)
126 movaps %xmm9,0x30(%rsp)
127 movaps %xmm10,0x40(%rsp)
128 movaps %xmm11,0x50(%rsp)
129 movaps %xmm12,0x60(%rsp)
130 movaps %xmm13,-0x68(%rax) # not used, saved to share se_handler
131 movaps %xmm14,-0x58(%rax)
132 movaps %xmm15,-0x48(%rax)
134 $code.=<<___;
135 # stack layout
137 # +0 output sink
138 # +16 input sink [original %rsp and $num]
139 # +32 counters
141 sub \$48,%rsp
142 and \$-64,%rsp # 64-byte align the frame
143 mov %rax,16(%rsp) # original %rsp
145 .Lenc4x_body:
146 movdqu ($key),$zero # 0-round key
147 lea 0x78($key),$key # size optimization
148 lea 40*2($inp),$inp # bias by two descriptors; offsets below subtract it again
150 .Lenc4x_loop_grande:
151 mov $num,24(%rsp) # original $num
152 xor $num,$num # from here on num accumulates the largest block count
154 for($i=0;$i<4;$i++) { # one 40-byte descriptor per lane
155 $code.=<<___;
156 mov `40*$i+16-40*2`($inp),$one # borrow $one for number of blocks
157 mov `40*$i+0-40*2`($inp),@inptr[$i] # input pointer
158 cmp $num,$one
159 mov `40*$i+8-40*2`($inp),@outptr[$i] # output pointer
160 cmovg $one,$num # find maximum
161 test $one,$one
162 movdqu `40*$i+24-40*2`($inp),@out[$i] # load IV
163 mov $one,`32+4*$i`(%rsp) # initialize counters
164 cmovle %rsp,@inptr[$i] # cancel input (block count not positive: read from the stack instead)
# Skip all work if no lane has blocks. Otherwise prime the pipeline: load the
# first two round keys and the round count, mix IV, 0-round key and first input
# block into each lane's state, then enter the per-block loop and run round 1.
# Fix: the fourth input prefetch used lane 2 a second time, so lane 3's input was
# never prefetched; it now uses lane 3, matching the decrypt routine.
167 $code.=<<___;
168 test $num,$num # largest block count is zero: nothing to do
169 jz .Lenc4x_done
171 movups 0x10-0x78($key),$rndkey1
172 pxor $zero,@out[0]
173 movups 0x20-0x78($key),$rndkey0
174 pxor $zero,@out[1]
175 mov 0xf0-0x78($key),$rounds # round count, read from offset 0xf0 of the key structure
176 pxor $zero,@out[2]
177 movdqu (@inptr[0]),@inp[0] # load inputs
178 pxor $zero,@out[3]
179 movdqu (@inptr[1]),@inp[1]
180 pxor @inp[0],@out[0]
181 movdqu (@inptr[2]),@inp[2]
182 pxor @inp[1],@out[1]
183 movdqu (@inptr[3]),@inp[3]
184 pxor @inp[2],@out[2]
185 pxor @inp[3],@out[3]
186 movdqa 32(%rsp),$counters # load counters
187 xor $offset,$offset
188 jmp .Loop_enc4x
190 .align 32
191 .Loop_enc4x:
192 add \$16,$offset # advance one block
193 lea 16(%rsp),$sink # sink pointer
194 mov \$1,$one # constant of 1
195 sub $offset,$sink # sink plus offset now addresses rsp+16
197 aesenc $rndkey1,@out[0]
198 prefetcht0 31(@inptr[0],$offset) # prefetch input
199 prefetcht0 31(@inptr[1],$offset)
200 aesenc $rndkey1,@out[1]
201 prefetcht0 31(@inptr[2],$offset)
202 prefetcht0 31(@inptr[3],$offset)
203 aesenc $rndkey1,@out[2]
204 aesenc $rndkey1,@out[3]
205 movups 0x30-0x78($key),$rndkey1
# Remainder of the 4-lane encrypt loop: four rounds interleaved with per-lane
# end-of-data checks, counter update, the remaining rounds, the tail that stores
# ciphertext and chains it into the next block, and the epilogue.
207 for($i=0;$i<4;$i++) { # each pass emits one round and the end-of-data check for lane i
208 my $rndkey = ($i&1) ? $rndkey1 : $rndkey0; # alternate the two round-key registers
209 $code.=<<___;
210 cmp `32+4*$i`(%rsp),$one # compare 1 with this lane's remaining count
211 aesenc $rndkey,@out[0]
212 aesenc $rndkey,@out[1]
213 aesenc $rndkey,@out[2]
214 cmovge $sink,@inptr[$i] # cancel input
215 cmovg $sink,@outptr[$i] # sink output
216 aesenc $rndkey,@out[3]
217 movups `0x40+16*$i-0x78`($key),$rndkey
220 $code.=<<___;
221 movdqa $counters,$mask
222 aesenc $rndkey0,@out[0]
223 prefetcht0 15(@outptr[0],$offset) # prefetch output
224 prefetcht0 15(@outptr[1],$offset)
225 aesenc $rndkey0,@out[1]
226 prefetcht0 15(@outptr[2],$offset)
227 prefetcht0 15(@outptr[3],$offset)
228 aesenc $rndkey0,@out[2]
229 aesenc $rndkey0,@out[3]
230 movups 0x80-0x78($key),$rndkey0
231 pxor $zero,$zero
233 aesenc $rndkey1,@out[0]
234 pcmpgtd $zero,$mask # mask becomes -1 where the counter is still positive
235 movdqu -0x78($key),$zero # reload 0-round key
236 aesenc $rndkey1,@out[1]
237 paddd $mask,$counters # decrement counters
238 movdqa $counters,32(%rsp) # update counters
239 aesenc $rndkey1,@out[2]
240 aesenc $rndkey1,@out[3]
241 movups 0x90-0x78($key),$rndkey1
243 cmp \$11,$rounds # flags are consumed by the jb and je below
245 aesenc $rndkey0,@out[0]
246 aesenc $rndkey0,@out[1]
247 aesenc $rndkey0,@out[2]
248 aesenc $rndkey0,@out[3]
249 movups 0xa0-0x78($key),$rndkey0
251 jb .Lenc4x_tail # round count below 11: no extra rounds
253 aesenc $rndkey1,@out[0]
254 aesenc $rndkey1,@out[1]
255 aesenc $rndkey1,@out[2]
256 aesenc $rndkey1,@out[3]
257 movups 0xb0-0x78($key),$rndkey1
259 aesenc $rndkey0,@out[0]
260 aesenc $rndkey0,@out[1]
261 aesenc $rndkey0,@out[2]
262 aesenc $rndkey0,@out[3]
263 movups 0xc0-0x78($key),$rndkey0
265 je .Lenc4x_tail # round count equal to 11: two extra rounds were enough
267 aesenc $rndkey1,@out[0]
268 aesenc $rndkey1,@out[1]
269 aesenc $rndkey1,@out[2]
270 aesenc $rndkey1,@out[3]
271 movups 0xd0-0x78($key),$rndkey1
273 aesenc $rndkey0,@out[0]
274 aesenc $rndkey0,@out[1]
275 aesenc $rndkey0,@out[2]
276 aesenc $rndkey0,@out[3]
277 movups 0xe0-0x78($key),$rndkey0
278 jmp .Lenc4x_tail
280 .align 32
281 .Lenc4x_tail:
282 aesenc $rndkey1,@out[0]
283 aesenc $rndkey1,@out[1]
284 aesenc $rndkey1,@out[2]
285 aesenc $rndkey1,@out[3]
286 movdqu (@inptr[0],$offset),@inp[0]
287 movdqu 0x10-0x78($key),$rndkey1
289 aesenclast $rndkey0,@out[0]
290 movdqu (@inptr[1],$offset),@inp[1]
291 pxor $zero,@inp[0]
292 aesenclast $rndkey0,@out[1]
293 movdqu (@inptr[2],$offset),@inp[2]
294 pxor $zero,@inp[1]
295 aesenclast $rndkey0,@out[2]
296 movdqu (@inptr[3],$offset),@inp[3]
297 pxor $zero,@inp[2]
298 aesenclast $rndkey0,@out[3]
299 movdqu 0x20-0x78($key),$rndkey0
300 pxor $zero,@inp[3]
302 movups @out[0],-16(@outptr[0],$offset)
303 pxor @inp[0],@out[0]
304 movups @out[1],-16(@outptr[1],$offset)
305 pxor @inp[1],@out[1]
306 movups @out[2],-16(@outptr[2],$offset)
307 pxor @inp[2],@out[2]
308 movups @out[3],-16(@outptr[3],$offset)
309 pxor @inp[3],@out[3]
311 dec $num
312 jnz .Loop_enc4x
314 mov 16(%rsp),%rax # original %rsp
315 mov 24(%rsp),$num
317 #pxor @inp[0],@out[0]
318 #pxor @inp[1],@out[1]
319 #movdqu @out[0],`40*0+24-40*2`($inp) # output iv FIX ME!
320 #pxor @inp[2],@out[2]
321 #movdqu @out[1],`40*1+24-40*2`($inp)
322 #pxor @inp[3],@out[3]
323 #movdqu @out[2],`40*2+24-40*2`($inp) # won't fix, let caller
324 #movdqu @out[3],`40*3+24-40*2`($inp) # figure this out...
326 lea `40*4`($inp),$inp
327 dec $num
328 jnz .Lenc4x_loop_grande
330 .Lenc4x_done:
332 $code.=<<___ if ($win64);
333 movaps -0xd8(%rax),%xmm6
334 movaps -0xc8(%rax),%xmm7
335 movaps -0xb8(%rax),%xmm8
336 movaps -0xa8(%rax),%xmm9
337 movaps -0x98(%rax),%xmm10
338 movaps -0x88(%rax),%xmm11
339 movaps -0x78(%rax),%xmm12
340 #movaps -0x68(%rax),%xmm13
341 #movaps -0x58(%rax),%xmm14
342 #movaps -0x48(%rax),%xmm15
344 $code.=<<___;
345 mov -48(%rax),%r15
346 mov -40(%rax),%r14
347 mov -32(%rax),%r13
348 mov -24(%rax),%r12
349 mov -16(%rax),%rbp
350 mov -8(%rax),%rbx
351 lea (%rax),%rsp
352 .Lenc4x_epilogue:
354 .size aesni_multi_cbc_encrypt,.-aesni_multi_cbc_encrypt
# 4-lane CBC decrypt routine: entry point, optional dispatch to the AVX variant,
# register save, stack frame and loading of the four lane descriptors.
356 .globl aesni_multi_cbc_decrypt
357 .type aesni_multi_cbc_decrypt,\@function,3
358 .align 32
359 aesni_multi_cbc_decrypt:
361 $code.=<<___ if ($avx);
362 cmp \$2,$num # num below 2: stay on the 4-lane code
363 jb .Ldec_non_avx
364 mov OPENSSL_ia32cap_P+4(%rip),%ecx
365 test \$`1<<28`,%ecx # AVX bit
366 jnz _avx_cbc_dec_shortcut # bit set: continue in the 8-lane routine
367 jmp .Ldec_non_avx
368 .align 16
369 .Ldec_non_avx:
371 $code.=<<___;
372 mov %rsp,%rax # rax keeps the entry stack pointer for the epilogue
373 push %rbx
374 push %rbp
375 push %r12
376 push %r13
377 push %r14
378 push %r15
380 $code.=<<___ if ($win64);
381 lea -0xa8(%rsp),%rsp # room for the xmm save area below the six pushes
382 movaps %xmm6,(%rsp)
383 movaps %xmm7,0x10(%rsp)
384 movaps %xmm8,0x20(%rsp)
385 movaps %xmm9,0x30(%rsp)
386 movaps %xmm10,0x40(%rsp)
387 movaps %xmm11,0x50(%rsp)
388 movaps %xmm12,0x60(%rsp)
389 movaps %xmm13,-0x68(%rax) # not used, saved to share se_handler
390 movaps %xmm14,-0x58(%rax)
391 movaps %xmm15,-0x48(%rax)
393 $code.=<<___;
394 # stack layout
396 # +0 output sink
397 # +16 input sink [original %rsp and $num]
398 # +32 counters
400 sub \$48,%rsp
401 and \$-64,%rsp # 64-byte align the frame
402 mov %rax,16(%rsp) # original %rsp
404 .Ldec4x_body:
405 movdqu ($key),$zero # 0-round key
406 lea 0x78($key),$key # size optimization
407 lea 40*2($inp),$inp # bias by two descriptors; offsets below subtract it again
409 .Ldec4x_loop_grande:
410 mov $num,24(%rsp) # original $num
411 xor $num,$num # from here on num accumulates the largest block count
413 for($i=0;$i<4;$i++) { # one 40-byte descriptor per lane
414 $code.=<<___;
415 mov `40*$i+16-40*2`($inp),$one # borrow $one for number of blocks
416 mov `40*$i+0-40*2`($inp),@inptr[$i] # input pointer
417 cmp $num,$one
418 mov `40*$i+8-40*2`($inp),@outptr[$i] # output pointer
419 cmovg $one,$num # find maximum
420 test $one,$one
421 movdqu `40*$i+24-40*2`($inp),@inp[$i] # load IV
422 mov $one,`32+4*$i`(%rsp) # initialize counters
423 cmovle %rsp,@inptr[$i] # cancel input (block count not positive: read from the stack instead)
# Body of the 4-lane decrypt routine: load the first ciphertext blocks, run the
# rounds interleaved with per-lane end-of-data checks and the counter update,
# then the tail that applies the IV, stores plaintext and loads the next blocks.
426 $code.=<<___;
427 test $num,$num # largest block count is zero: nothing to do
428 jz .Ldec4x_done
430 movups 0x10-0x78($key),$rndkey1
431 movups 0x20-0x78($key),$rndkey0
432 mov 0xf0-0x78($key),$rounds # round count, read from offset 0xf0 of the key structure
433 movdqu (@inptr[0]),@out[0] # load inputs
434 movdqu (@inptr[1]),@out[1]
435 pxor $zero,@out[0]
436 movdqu (@inptr[2]),@out[2]
437 pxor $zero,@out[1]
438 movdqu (@inptr[3]),@out[3]
439 pxor $zero,@out[2]
440 pxor $zero,@out[3]
441 movdqa 32(%rsp),$counters # load counters
442 xor $offset,$offset
443 jmp .Loop_dec4x
445 .align 32
446 .Loop_dec4x:
447 add \$16,$offset # advance one block
448 lea 16(%rsp),$sink # sink pointer
449 mov \$1,$one # constant of 1
450 sub $offset,$sink # sink plus offset now addresses rsp+16
452 aesdec $rndkey1,@out[0]
453 prefetcht0 31(@inptr[0],$offset) # prefetch input
454 prefetcht0 31(@inptr[1],$offset)
455 aesdec $rndkey1,@out[1]
456 prefetcht0 31(@inptr[2],$offset)
457 prefetcht0 31(@inptr[3],$offset)
458 aesdec $rndkey1,@out[2]
459 aesdec $rndkey1,@out[3]
460 movups 0x30-0x78($key),$rndkey1
462 for($i=0;$i<4;$i++) { # each pass emits one round and the end-of-data check for lane i
463 my $rndkey = ($i&1) ? $rndkey1 : $rndkey0; # alternate the two round-key registers
464 $code.=<<___;
465 cmp `32+4*$i`(%rsp),$one # compare 1 with this lane's remaining count
466 aesdec $rndkey,@out[0]
467 aesdec $rndkey,@out[1]
468 aesdec $rndkey,@out[2]
469 cmovge $sink,@inptr[$i] # cancel input
470 cmovg $sink,@outptr[$i] # sink output
471 aesdec $rndkey,@out[3]
472 movups `0x40+16*$i-0x78`($key),$rndkey
475 $code.=<<___;
476 movdqa $counters,$mask
477 aesdec $rndkey0,@out[0]
478 prefetcht0 15(@outptr[0],$offset) # prefetch output
479 prefetcht0 15(@outptr[1],$offset)
480 aesdec $rndkey0,@out[1]
481 prefetcht0 15(@outptr[2],$offset)
482 prefetcht0 15(@outptr[3],$offset)
483 aesdec $rndkey0,@out[2]
484 aesdec $rndkey0,@out[3]
485 movups 0x80-0x78($key),$rndkey0
486 pxor $zero,$zero
488 aesdec $rndkey1,@out[0]
489 pcmpgtd $zero,$mask # mask becomes -1 where the counter is still positive
490 movdqu -0x78($key),$zero # reload 0-round key
491 aesdec $rndkey1,@out[1]
492 paddd $mask,$counters # decrement counters
493 movdqa $counters,32(%rsp) # update counters
494 aesdec $rndkey1,@out[2]
495 aesdec $rndkey1,@out[3]
496 movups 0x90-0x78($key),$rndkey1
498 cmp \$11,$rounds # flags are consumed by the jb and je below
500 aesdec $rndkey0,@out[0]
501 aesdec $rndkey0,@out[1]
502 aesdec $rndkey0,@out[2]
503 aesdec $rndkey0,@out[3]
504 movups 0xa0-0x78($key),$rndkey0
506 jb .Ldec4x_tail # round count below 11: no extra rounds
508 aesdec $rndkey1,@out[0]
509 aesdec $rndkey1,@out[1]
510 aesdec $rndkey1,@out[2]
511 aesdec $rndkey1,@out[3]
512 movups 0xb0-0x78($key),$rndkey1
514 aesdec $rndkey0,@out[0]
515 aesdec $rndkey0,@out[1]
516 aesdec $rndkey0,@out[2]
517 aesdec $rndkey0,@out[3]
518 movups 0xc0-0x78($key),$rndkey0
520 je .Ldec4x_tail # round count equal to 11: two extra rounds were enough
522 aesdec $rndkey1,@out[0]
523 aesdec $rndkey1,@out[1]
524 aesdec $rndkey1,@out[2]
525 aesdec $rndkey1,@out[3]
526 movups 0xd0-0x78($key),$rndkey1
528 aesdec $rndkey0,@out[0]
529 aesdec $rndkey0,@out[1]
530 aesdec $rndkey0,@out[2]
531 aesdec $rndkey0,@out[3]
532 movups 0xe0-0x78($key),$rndkey0
533 jmp .Ldec4x_tail
535 .align 32
536 .Ldec4x_tail:
537 aesdec $rndkey1,@out[0]
538 aesdec $rndkey1,@out[1]
539 aesdec $rndkey1,@out[2]
540 pxor $rndkey0,@inp[0] # fold the last round key into the IV so the final round also applies the IV
541 pxor $rndkey0,@inp[1]
542 aesdec $rndkey1,@out[3]
543 movdqu 0x10-0x78($key),$rndkey1
544 pxor $rndkey0,@inp[2]
545 pxor $rndkey0,@inp[3]
546 movdqu 0x20-0x78($key),$rndkey0
548 aesdeclast @inp[0],@out[0]
549 aesdeclast @inp[1],@out[1]
550 movdqu -16(@inptr[0],$offset),@inp[0] # load next IV
551 movdqu -16(@inptr[1],$offset),@inp[1]
552 aesdeclast @inp[2],@out[2]
553 aesdeclast @inp[3],@out[3]
554 movdqu -16(@inptr[2],$offset),@inp[2]
555 movdqu -16(@inptr[3],$offset),@inp[3]
557 movups @out[0],-16(@outptr[0],$offset)
558 movdqu (@inptr[0],$offset),@out[0]
559 movups @out[1],-16(@outptr[1],$offset)
560 movdqu (@inptr[1],$offset),@out[1]
561 pxor $zero,@out[0]
562 movups @out[2],-16(@outptr[2],$offset)
563 movdqu (@inptr[2],$offset),@out[2]
564 pxor $zero,@out[1]
565 movups @out[3],-16(@outptr[3],$offset)
566 movdqu (@inptr[3],$offset),@out[3]
567 pxor $zero,@out[2]
568 pxor $zero,@out[3]
570 dec $num
571 jnz .Loop_dec4x
573 mov 16(%rsp),%rax # original %rsp
574 mov 24(%rsp),$num
576 lea `40*4`($inp),$inp
577 dec $num
578 jnz .Ldec4x_loop_grande
580 .Ldec4x_done:
582 $code.=<<___ if ($win64);
583 movaps -0xd8(%rax),%xmm6
584 movaps -0xc8(%rax),%xmm7
585 movaps -0xb8(%rax),%xmm8
586 movaps -0xa8(%rax),%xmm9
587 movaps -0x98(%rax),%xmm10
588 movaps -0x88(%rax),%xmm11
589 movaps -0x78(%rax),%xmm12
590 #movaps -0x68(%rax),%xmm13
591 #movaps -0x58(%rax),%xmm14
592 #movaps -0x48(%rax),%xmm15
594 $code.=<<___;
595 mov -48(%rax),%r15
596 mov -40(%rax),%r14
597 mov -32(%rax),%r13
598 mov -24(%rax),%r12
599 mov -16(%rax),%rbp
600 mov -8(%rax),%rbx
601 lea (%rax),%rsp
602 .Ldec4x_epilogue:
604 .size aesni_multi_cbc_decrypt,.-aesni_multi_cbc_decrypt
# 8-lane AVX code, generated only when the assembler probe set $avx.
# This part declares the register map, then emits the encrypt entry point,
# register save, stack frame and loading of the eight lane descriptors.
607 if ($avx) {{{
608 my @ptr=map("%r$_",(8..15)); # one pointer per lane, switched between input and output
609 my $offload=$sink; # reuses the sink register
611 my @out=map("%xmm$_",(2..9)); # per-lane cipher state
612 my @inp=map("%xmm$_",(10..13)); # four input registers shared by eight lanes
613 my ($counters,$zero)=("%xmm14","%xmm15");
615 $code.=<<___;
616 .type aesni_multi_cbc_encrypt_avx,\@function,3
617 .align 32
618 aesni_multi_cbc_encrypt_avx:
619 _avx_cbc_enc_shortcut:
620 mov %rsp,%rax # rax keeps the entry stack pointer for the epilogue
621 push %rbx
622 push %rbp
623 push %r12
624 push %r13
625 push %r14
626 push %r15
628 $code.=<<___ if ($win64);
629 lea -0xa8(%rsp),%rsp # room for the xmm save area below the six pushes
630 movaps %xmm6,(%rsp)
631 movaps %xmm7,0x10(%rsp)
632 movaps %xmm8,0x20(%rsp)
633 movaps %xmm9,0x30(%rsp)
634 movaps %xmm10,0x40(%rsp)
635 movaps %xmm11,0x50(%rsp)
636 movaps %xmm12,-0x78(%rax)
637 movaps %xmm13,-0x68(%rax)
638 movaps %xmm14,-0x58(%rax)
639 movaps %xmm15,-0x48(%rax)
641 $code.=<<___;
642 # stack layout
644 # +0 output sink
645 # +16 input sink [original %rsp and $num]
646 # +32 counters
647 # +64 distances between inputs and outputs
648 # +128 off-load area for @inp[0..3]
650 sub \$192,%rsp
651 and \$-128,%rsp # 128-byte align the frame
652 mov %rax,16(%rsp) # original %rsp
654 .Lenc8x_body:
655 vzeroupper
656 vmovdqu ($key),$zero # 0-round key
657 lea 0x78($key),$key # size optimization
658 lea 40*4($inp),$inp # bias by four descriptors; offsets below subtract it again
659 shr \$1,$num # halved, but overwritten by the xor just below
661 .Lenc8x_loop_grande:
662 #mov $num,24(%rsp) # original $num
663 xor $num,$num # from here on num accumulates the largest block count
665 for($i=0;$i<8;$i++) { # one 40-byte descriptor per lane
666 my $temp = $i ? $offload : $offset; # lane 0 leaves its distance in the offset register
667 $code.=<<___;
668 mov `40*$i+16-40*4`($inp),$one # borrow $one for number of blocks
669 mov `40*$i+0-40*4`($inp),@ptr[$i] # input pointer
670 cmp $num,$one
671 mov `40*$i+8-40*4`($inp),$temp # output pointer
672 cmovg $one,$num # find maximum
673 test $one,$one
674 vmovdqu `40*$i+24-40*4`($inp),@out[$i] # load IV
675 mov $one,`32+4*$i`(%rsp) # initialize counters
676 cmovle %rsp,@ptr[$i] # cancel input
677 sub @ptr[$i],$temp # distance between input and output
678 mov $temp,`64+8*$i`(%rsp) # initialize distances
# 8-lane encrypt setup and the eight unrolled round steps. Each step runs one
# round on all lanes and, for lane i, checks for end of data, loads the next
# input block and switches the lane pointer from input to output.
681 $code.=<<___;
682 test $num,$num # largest block count is zero: nothing to do
683 jz .Lenc8x_done
685 vmovups 0x10-0x78($key),$rndkey1
686 vmovups 0x20-0x78($key),$rndkey0
687 mov 0xf0-0x78($key),$rounds # round count, read from offset 0xf0 of the key structure
689 vpxor (@ptr[0]),$zero,@inp[0] # load inputs and xor with 0-round
690 lea 128(%rsp),$offload # offload area
691 vpxor (@ptr[1]),$zero,@inp[1]
692 vpxor (@ptr[2]),$zero,@inp[2]
693 vpxor (@ptr[3]),$zero,@inp[3]
694 vpxor @inp[0],@out[0],@out[0]
695 vpxor (@ptr[4]),$zero,@inp[0]
696 vpxor @inp[1],@out[1],@out[1]
697 vpxor (@ptr[5]),$zero,@inp[1]
698 vpxor @inp[2],@out[2],@out[2]
699 vpxor (@ptr[6]),$zero,@inp[2]
700 vpxor @inp[3],@out[3],@out[3]
701 vpxor (@ptr[7]),$zero,@inp[3]
702 vpxor @inp[0],@out[4],@out[4]
703 mov \$1,$one # constant of 1
704 vpxor @inp[1],@out[5],@out[5]
705 vpxor @inp[2],@out[6],@out[6]
706 vpxor @inp[3],@out[7],@out[7]
707 jmp .Loop_enc8x
709 .align 32
710 .Loop_enc8x:
712 for($i=0;$i<8;$i++) { # one round per pass, plus bookkeeping for lane i
713 my $rndkey=($i&1)?$rndkey0:$rndkey1; # alternate the two round-key registers
714 $code.=<<___;
715 vaesenc $rndkey,@out[0],@out[0]
716 cmp 32+4*$i(%rsp),$one
718 $code.=<<___ if ($i);
719 mov 64+8*$i(%rsp),$offset
721 $code.=<<___;
722 vaesenc $rndkey,@out[1],@out[1]
723 prefetcht0 31(@ptr[$i]) # prefetch input
724 vaesenc $rndkey,@out[2],@out[2]
726 $code.=<<___ if ($i>1);
727 prefetcht0 15(@ptr[$i-2]) # prefetch output
729 $code.=<<___;
730 vaesenc $rndkey,@out[3],@out[3]
731 lea (@ptr[$i],$offset),$offset # input pointer plus distance gives the output pointer
732 cmovge %rsp,@ptr[$i] # cancel input
733 vaesenc $rndkey,@out[4],@out[4]
734 cmovg %rsp,$offset # sink output
735 vaesenc $rndkey,@out[5],@out[5]
736 sub @ptr[$i],$offset # back to a distance, now reflecting any redirection
737 vaesenc $rndkey,@out[6],@out[6]
738 vpxor 16(@ptr[$i]),$zero,@inp[$i%4] # load input and xor with 0-round
739 mov $offset,64+8*$i(%rsp)
740 vaesenc $rndkey,@out[7],@out[7]
741 vmovups `16*(3+$i)-0x78`($key),$rndkey
742 lea 16(@ptr[$i],$offset),@ptr[$i] # switch to output
744 $code.=<<___ if ($i<4)
745 vmovdqu @inp[$i%4],`16*$i`($offload) # off-load
# Remaining rounds for the longer key schedules, then the tail: final round,
# counter update for both groups of four lanes, ciphertext store, switch of each
# lane pointer back to input, chaining with the next input block, and epilogue.
748 $code.=<<___;
749 vmovdqu 32(%rsp),$counters # counters of lanes 0-3
750 prefetcht0 15(@ptr[$i-2]) # prefetch output
751 prefetcht0 15(@ptr[$i-1])
752 cmp \$11,$rounds # flags are consumed by the jb and je below
753 jb .Lenc8x_tail
755 vaesenc $rndkey1,@out[0],@out[0]
756 vaesenc $rndkey1,@out[1],@out[1]
757 vaesenc $rndkey1,@out[2],@out[2]
758 vaesenc $rndkey1,@out[3],@out[3]
759 vaesenc $rndkey1,@out[4],@out[4]
760 vaesenc $rndkey1,@out[5],@out[5]
761 vaesenc $rndkey1,@out[6],@out[6]
762 vaesenc $rndkey1,@out[7],@out[7]
763 vmovups 0xb0-0x78($key),$rndkey1
765 vaesenc $rndkey0,@out[0],@out[0]
766 vaesenc $rndkey0,@out[1],@out[1]
767 vaesenc $rndkey0,@out[2],@out[2]
768 vaesenc $rndkey0,@out[3],@out[3]
769 vaesenc $rndkey0,@out[4],@out[4]
770 vaesenc $rndkey0,@out[5],@out[5]
771 vaesenc $rndkey0,@out[6],@out[6]
772 vaesenc $rndkey0,@out[7],@out[7]
773 vmovups 0xc0-0x78($key),$rndkey0
774 je .Lenc8x_tail
776 vaesenc $rndkey1,@out[0],@out[0]
777 vaesenc $rndkey1,@out[1],@out[1]
778 vaesenc $rndkey1,@out[2],@out[2]
779 vaesenc $rndkey1,@out[3],@out[3]
780 vaesenc $rndkey1,@out[4],@out[4]
781 vaesenc $rndkey1,@out[5],@out[5]
782 vaesenc $rndkey1,@out[6],@out[6]
783 vaesenc $rndkey1,@out[7],@out[7]
784 vmovups 0xd0-0x78($key),$rndkey1
786 vaesenc $rndkey0,@out[0],@out[0]
787 vaesenc $rndkey0,@out[1],@out[1]
788 vaesenc $rndkey0,@out[2],@out[2]
789 vaesenc $rndkey0,@out[3],@out[3]
790 vaesenc $rndkey0,@out[4],@out[4]
791 vaesenc $rndkey0,@out[5],@out[5]
792 vaesenc $rndkey0,@out[6],@out[6]
793 vaesenc $rndkey0,@out[7],@out[7]
794 vmovups 0xe0-0x78($key),$rndkey0
796 .Lenc8x_tail:
797 vaesenc $rndkey1,@out[0],@out[0]
798 vpxor $zero,$zero,$zero
799 vaesenc $rndkey1,@out[1],@out[1]
800 vaesenc $rndkey1,@out[2],@out[2]
801 vpcmpgtd $zero,$counters,$zero
802 vaesenc $rndkey1,@out[3],@out[3]
803 vaesenc $rndkey1,@out[4],@out[4]
804 vpaddd $counters,$zero,$zero # decrement counters
805 vmovdqu 48(%rsp),$counters # counters of lanes 4-7
806 vaesenc $rndkey1,@out[5],@out[5]
807 mov 64(%rsp),$offset # pre-load 1st offset
808 vaesenc $rndkey1,@out[6],@out[6]
809 vaesenc $rndkey1,@out[7],@out[7]
810 vmovups 0x10-0x78($key),$rndkey1
812 vaesenclast $rndkey0,@out[0],@out[0]
813 vmovdqa $zero,32(%rsp) # update counters
814 vpxor $zero,$zero,$zero
815 vaesenclast $rndkey0,@out[1],@out[1]
816 vaesenclast $rndkey0,@out[2],@out[2]
817 vpcmpgtd $zero,$counters,$zero
818 vaesenclast $rndkey0,@out[3],@out[3]
819 vaesenclast $rndkey0,@out[4],@out[4]
820 vpaddd $zero,$counters,$counters # decrement counters
821 vmovdqu -0x78($key),$zero # 0-round
822 vaesenclast $rndkey0,@out[5],@out[5]
823 vaesenclast $rndkey0,@out[6],@out[6]
824 vmovdqa $counters,48(%rsp) # update counters
825 vaesenclast $rndkey0,@out[7],@out[7]
826 vmovups 0x20-0x78($key),$rndkey0
828 vmovups @out[0],-16(@ptr[0]) # write output
829 sub $offset,@ptr[0] # switch to input
830 vpxor 0x00($offload),@out[0],@out[0]
831 vmovups @out[1],-16(@ptr[1])
832 sub `64+1*8`(%rsp),@ptr[1]
833 vpxor 0x10($offload),@out[1],@out[1]
834 vmovups @out[2],-16(@ptr[2])
835 sub `64+2*8`(%rsp),@ptr[2]
836 vpxor 0x20($offload),@out[2],@out[2]
837 vmovups @out[3],-16(@ptr[3])
838 sub `64+3*8`(%rsp),@ptr[3]
839 vpxor 0x30($offload),@out[3],@out[3]
840 vmovups @out[4],-16(@ptr[4])
841 sub `64+4*8`(%rsp),@ptr[4]
842 vpxor @inp[0],@out[4],@out[4]
843 vmovups @out[5],-16(@ptr[5])
844 sub `64+5*8`(%rsp),@ptr[5]
845 vpxor @inp[1],@out[5],@out[5]
846 vmovups @out[6],-16(@ptr[6])
847 sub `64+6*8`(%rsp),@ptr[6]
848 vpxor @inp[2],@out[6],@out[6]
849 vmovups @out[7],-16(@ptr[7])
850 sub `64+7*8`(%rsp),@ptr[7]
851 vpxor @inp[3],@out[7],@out[7]
853 dec $num
854 jnz .Loop_enc8x
856 mov 16(%rsp),%rax # original %rsp
857 #mov 24(%rsp),$num
858 #lea `40*8`($inp),$inp
859 #dec $num
860 #jnz .Lenc8x_loop_grande
862 .Lenc8x_done:
863 vzeroupper
865 $code.=<<___ if ($win64);
866 movaps -0xd8(%rax),%xmm6
867 movaps -0xc8(%rax),%xmm7
868 movaps -0xb8(%rax),%xmm8
869 movaps -0xa8(%rax),%xmm9
870 movaps -0x98(%rax),%xmm10
871 movaps -0x88(%rax),%xmm11
872 movaps -0x78(%rax),%xmm12
873 movaps -0x68(%rax),%xmm13
874 movaps -0x58(%rax),%xmm14
875 movaps -0x48(%rax),%xmm15
877 $code.=<<___;
878 mov -48(%rax),%r15
879 mov -40(%rax),%r14
880 mov -32(%rax),%r13
881 mov -24(%rax),%r12
882 mov -16(%rax),%rbp
883 mov -8(%rax),%rbx
884 lea (%rax),%rsp
885 .Lenc8x_epilogue:
887 .size aesni_multi_cbc_encrypt_avx,.-aesni_multi_cbc_encrypt_avx
# 8-lane AVX CBC decrypt routine: entry point, register save, stack frame and
# loading of the eight lane descriptors (the IVs are also saved to the stack).
889 .type aesni_multi_cbc_decrypt_avx,\@function,3
890 .align 32
891 aesni_multi_cbc_decrypt_avx:
892 _avx_cbc_dec_shortcut:
893 mov %rsp,%rax # rax keeps the entry stack pointer for the epilogue
894 push %rbx
895 push %rbp
896 push %r12
897 push %r13
898 push %r14
899 push %r15
901 $code.=<<___ if ($win64);
902 lea -0xa8(%rsp),%rsp # room for the xmm save area below the six pushes
903 movaps %xmm6,(%rsp)
904 movaps %xmm7,0x10(%rsp)
905 movaps %xmm8,0x20(%rsp)
906 movaps %xmm9,0x30(%rsp)
907 movaps %xmm10,0x40(%rsp)
908 movaps %xmm11,0x50(%rsp)
909 movaps %xmm12,-0x78(%rax)
910 movaps %xmm13,-0x68(%rax)
911 movaps %xmm14,-0x58(%rax)
912 movaps %xmm15,-0x48(%rax)
914 $code.=<<___;
915 # stack layout
917 # +0 output sink
918 # +16 input sink [original %rsp and $num]
919 # +32 counters
920 # +64 distances between inputs and outputs
921 # +128 off-load area for @inp[0..3]
922 # +192 IV/input offload
924 sub \$256,%rsp
925 and \$-256,%rsp
926 sub \$192,%rsp # rsp+192 is now a multiple of 256, so xor 0x80 on the offload pointer flips between two 128-byte halves
927 mov %rax,16(%rsp) # original %rsp
929 .Ldec8x_body:
930 vzeroupper
931 vmovdqu ($key),$zero # 0-round key
932 lea 0x78($key),$key # size optimization
933 lea 40*4($inp),$inp # bias by four descriptors; offsets below subtract it again
934 shr \$1,$num # halved, but overwritten by the xor just below
936 .Ldec8x_loop_grande:
937 #mov $num,24(%rsp) # original $num
938 xor $num,$num # from here on num accumulates the largest block count
940 for($i=0;$i<8;$i++) { # one 40-byte descriptor per lane
941 my $temp = $i ? $offload : $offset; # lane 0 leaves its distance in the offset register
942 $code.=<<___;
943 mov `40*$i+16-40*4`($inp),$one # borrow $one for number of blocks
944 mov `40*$i+0-40*4`($inp),@ptr[$i] # input pointer
945 cmp $num,$one
946 mov `40*$i+8-40*4`($inp),$temp # output pointer
947 cmovg $one,$num # find maximum
948 test $one,$one
949 vmovdqu `40*$i+24-40*4`($inp),@out[$i] # load IV
950 mov $one,`32+4*$i`(%rsp) # initialize counters
951 cmovle %rsp,@ptr[$i] # cancel input
952 sub @ptr[$i],$temp # distance between input and output
953 mov $temp,`64+8*$i`(%rsp) # initialize distances
954 vmovdqu @out[$i],`192+16*$i`(%rsp) # offload IV
# 8-lane decrypt setup and the eight unrolled round steps. Setup saves the first
# ciphertext blocks in the upper offload half, then points the offload register
# at the half holding the IVs. Each step runs one round on all lanes and, for
# lane i, checks for end of data and loads the next ciphertext block.
957 $code.=<<___;
958 test $num,$num # largest block count is zero: nothing to do
959 jz .Ldec8x_done
961 vmovups 0x10-0x78($key),$rndkey1
962 vmovups 0x20-0x78($key),$rndkey0
963 mov 0xf0-0x78($key),$rounds # round count, read from offset 0xf0 of the key structure
964 lea 192+128(%rsp),$offload # offload area
966 vmovdqu (@ptr[0]),@out[0] # load inputs
967 vmovdqu (@ptr[1]),@out[1]
968 vmovdqu (@ptr[2]),@out[2]
969 vmovdqu (@ptr[3]),@out[3]
970 vmovdqu (@ptr[4]),@out[4]
971 vmovdqu (@ptr[5]),@out[5]
972 vmovdqu (@ptr[6]),@out[6]
973 vmovdqu (@ptr[7]),@out[7]
974 vmovdqu @out[0],0x00($offload) # offload inputs
975 vpxor $zero,@out[0],@out[0] # xor inputs with 0-round
976 vmovdqu @out[1],0x10($offload)
977 vpxor $zero,@out[1],@out[1]
978 vmovdqu @out[2],0x20($offload)
979 vpxor $zero,@out[2],@out[2]
980 vmovdqu @out[3],0x30($offload)
981 vpxor $zero,@out[3],@out[3]
982 vmovdqu @out[4],0x40($offload)
983 vpxor $zero,@out[4],@out[4]
984 vmovdqu @out[5],0x50($offload)
985 vpxor $zero,@out[5],@out[5]
986 vmovdqu @out[6],0x60($offload)
987 vpxor $zero,@out[6],@out[6]
988 vmovdqu @out[7],0x70($offload)
989 vpxor $zero,@out[7],@out[7]
990 xor \$0x80,$offload # flip to the half at rsp+192, where the IVs were saved
991 mov \$1,$one # constant of 1
992 jmp .Loop_dec8x
994 .align 32
995 .Loop_dec8x:
997 for($i=0;$i<8;$i++) { # one round per pass, plus bookkeeping for lane i
998 my $rndkey=($i&1)?$rndkey0:$rndkey1; # alternate the two round-key registers
999 $code.=<<___;
1000 vaesdec $rndkey,@out[0],@out[0]
1001 cmp 32+4*$i(%rsp),$one
1003 $code.=<<___ if ($i);
1004 mov 64+8*$i(%rsp),$offset
1006 $code.=<<___;
1007 vaesdec $rndkey,@out[1],@out[1]
1008 prefetcht0 31(@ptr[$i]) # prefetch input
1009 vaesdec $rndkey,@out[2],@out[2]
1011 $code.=<<___ if ($i>1);
1012 prefetcht0 15(@ptr[$i-2]) # prefetch output
1014 $code.=<<___;
1015 vaesdec $rndkey,@out[3],@out[3]
1016 lea (@ptr[$i],$offset),$offset # input pointer plus distance gives the output pointer
1017 cmovge %rsp,@ptr[$i] # cancel input
1018 vaesdec $rndkey,@out[4],@out[4]
1019 cmovg %rsp,$offset # sink output
1020 vaesdec $rndkey,@out[5],@out[5]
1021 sub @ptr[$i],$offset # back to a distance, now reflecting any redirection
1022 vaesdec $rndkey,@out[6],@out[6]
1023 vmovdqu 16(@ptr[$i]),@inp[$i%4] # load input
1024 mov $offset,64+8*$i(%rsp)
1025 vaesdec $rndkey,@out[7],@out[7]
1026 vmovups `16*(3+$i)-0x78`($key),$rndkey
1027 lea 16(@ptr[$i],$offset),@ptr[$i] # switch to output
1029 $code.=<<___ if ($i<4);
1030 vmovdqu @inp[$i%4],`128+16*$i`(%rsp) # off-load
1033 $code.=<<___;
1034 vmovdqu 32(%rsp),$counters
1035 prefetcht0 15(@ptr[$i-2]) # prefetch output
1036 prefetcht0 15(@ptr[$i-1])
1037 cmp \$11,$rounds
1038 jb .Ldec8x_tail
1040 vaesdec $rndkey1,@out[0],@out[0]
1041 vaesdec $rndkey1,@out[1],@out[1]
1042 vaesdec $rndkey1,@out[2],@out[2]
1043 vaesdec $rndkey1,@out[3],@out[3]
1044 vaesdec $rndkey1,@out[4],@out[4]
1045 vaesdec $rndkey1,@out[5],@out[5]
1046 vaesdec $rndkey1,@out[6],@out[6]
1047 vaesdec $rndkey1,@out[7],@out[7]
1048 vmovups 0xb0-0x78($key),$rndkey1
1050 vaesdec $rndkey0,@out[0],@out[0]
1051 vaesdec $rndkey0,@out[1],@out[1]
1052 vaesdec $rndkey0,@out[2],@out[2]
1053 vaesdec $rndkey0,@out[3],@out[3]
1054 vaesdec $rndkey0,@out[4],@out[4]
1055 vaesdec $rndkey0,@out[5],@out[5]
1056 vaesdec $rndkey0,@out[6],@out[6]
1057 vaesdec $rndkey0,@out[7],@out[7]
1058 vmovups 0xc0-0x78($key),$rndkey0
1059 je .Ldec8x_tail
1061 vaesdec $rndkey1,@out[0],@out[0]
1062 vaesdec $rndkey1,@out[1],@out[1]
1063 vaesdec $rndkey1,@out[2],@out[2]
1064 vaesdec $rndkey1,@out[3],@out[3]
1065 vaesdec $rndkey1,@out[4],@out[4]
1066 vaesdec $rndkey1,@out[5],@out[5]
1067 vaesdec $rndkey1,@out[6],@out[6]
1068 vaesdec $rndkey1,@out[7],@out[7]
1069 vmovups 0xd0-0x78($key),$rndkey1
1071 vaesdec $rndkey0,@out[0],@out[0]
1072 vaesdec $rndkey0,@out[1],@out[1]
1073 vaesdec $rndkey0,@out[2],@out[2]
1074 vaesdec $rndkey0,@out[3],@out[3]
1075 vaesdec $rndkey0,@out[4],@out[4]
1076 vaesdec $rndkey0,@out[5],@out[5]
1077 vaesdec $rndkey0,@out[6],@out[6]
1078 vaesdec $rndkey0,@out[7],@out[7]
1079 vmovups 0xe0-0x78($key),$rndkey0
1081 .Ldec8x_tail:
1082 vaesdec $rndkey1,@out[0],@out[0]
1083 vpxor $zero,$zero,$zero
1084 vaesdec $rndkey1,@out[1],@out[1]
1085 vaesdec $rndkey1,@out[2],@out[2]
1086 vpcmpgtd $zero,$counters,$zero
1087 vaesdec $rndkey1,@out[3],@out[3]
1088 vaesdec $rndkey1,@out[4],@out[4]
1089 vpaddd $counters,$zero,$zero # decrement counters
1090 vmovdqu 48(%rsp),$counters
1091 vaesdec $rndkey1,@out[5],@out[5]
1092 mov 64(%rsp),$offset # pre-load 1st offset
1093 vaesdec $rndkey1,@out[6],@out[6]
1094 vaesdec $rndkey1,@out[7],@out[7]
1095 vmovups 0x10-0x78($key),$rndkey1
1097 vaesdeclast $rndkey0,@out[0],@out[0]
1098 vmovdqa $zero,32(%rsp) # update counters
1099 vpxor $zero,$zero,$zero
1100 vaesdeclast $rndkey0,@out[1],@out[1]
1101 vpxor 0x00($offload),@out[0],@out[0] # xor with IV
1102 vaesdeclast $rndkey0,@out[2],@out[2]
1103 vpxor 0x10($offload),@out[1],@out[1]
1104 vpcmpgtd $zero,$counters,$zero
1105 vaesdeclast $rndkey0,@out[3],@out[3]
1106 vpxor 0x20($offload),@out[2],@out[2]
1107 vaesdeclast $rndkey0,@out[4],@out[4]
1108 vpxor 0x30($offload),@out[3],@out[3]
1109 vpaddd $zero,$counters,$counters # decrement counters
1110 vmovdqu -0x78($key),$zero # 0-round
1111 vaesdeclast $rndkey0,@out[5],@out[5]
1112 vpxor 0x40($offload),@out[4],@out[4]
1113 vaesdeclast $rndkey0,@out[6],@out[6]
1114 vpxor 0x50($offload),@out[5],@out[5]
1115 vmovdqa $counters,48(%rsp) # update counters
1116 vaesdeclast $rndkey0,@out[7],@out[7]
1117 vpxor 0x60($offload),@out[6],@out[6]
1118 vmovups 0x20-0x78($key),$rndkey0
1120 vmovups @out[0],-16(@ptr[0]) # write output
1121 sub $offset,@ptr[0] # switch to input
1122 vmovdqu 128+0(%rsp),@out[0]
1123 vpxor 0x70($offload),@out[7],@out[7]
1124 vmovups @out[1],-16(@ptr[1])
1125 sub `64+1*8`(%rsp),@ptr[1]
1126 vmovdqu @out[0],0x00($offload)
1127 vpxor $zero,@out[0],@out[0]
1128 vmovdqu 128+16(%rsp),@out[1]
1129 vmovups @out[2],-16(@ptr[2])
1130 sub `64+2*8`(%rsp),@ptr[2]
1131 vmovdqu @out[1],0x10($offload)
1132 vpxor $zero,@out[1],@out[1]
1133 vmovdqu 128+32(%rsp),@out[2]
1134 vmovups @out[3],-16(@ptr[3])
1135 sub `64+3*8`(%rsp),@ptr[3]
1136 vmovdqu @out[2],0x20($offload)
1137 vpxor $zero,@out[2],@out[2]
1138 vmovdqu 128+48(%rsp),@out[3]
1139 vmovups @out[4],-16(@ptr[4])
1140 sub `64+4*8`(%rsp),@ptr[4]
1141 vmovdqu @out[3],0x30($offload)
1142 vpxor $zero,@out[3],@out[3]
1143 vmovdqu @inp[0],0x40($offload)
1144 vpxor @inp[0],$zero,@out[4]
1145 vmovups @out[5],-16(@ptr[5])
1146 sub `64+5*8`(%rsp),@ptr[5]
1147 vmovdqu @inp[1],0x50($offload)
1148 vpxor @inp[1],$zero,@out[5]
1149 vmovups @out[6],-16(@ptr[6])
1150 sub `64+6*8`(%rsp),@ptr[6]
1151 vmovdqu @inp[2],0x60($offload)
1152 vpxor @inp[2],$zero,@out[6]
1153 vmovups @out[7],-16(@ptr[7])
1154 sub `64+7*8`(%rsp),@ptr[7]
1155 vmovdqu @inp[3],0x70($offload)
1156 vpxor @inp[3],$zero,@out[7]
1158 xor \$128,$offload
1159 dec $num
1160 jnz .Loop_dec8x
1162 mov 16(%rsp),%rax # original %rsp
1163 #mov 24(%rsp),$num
1164 #lea `40*8`($inp),$inp
1165 #dec $num
1166 #jnz .Ldec8x_loop_grande
1168 .Ldec8x_done:
1169 vzeroupper
# Win64 only: reload %xmm6-%xmm15 (non-volatile under the Win64 calling
# convention) from the save area located below the original stack pointer,
# which the preceding code has loaded into %rax.
1171 $code.=<<___ if ($win64);
1172 movaps -0xd8(%rax),%xmm6
1173 movaps -0xc8(%rax),%xmm7
1174 movaps -0xb8(%rax),%xmm8
1175 movaps -0xa8(%rax),%xmm9
1176 movaps -0x98(%rax),%xmm10
1177 movaps -0x88(%rax),%xmm11
1178 movaps -0x78(%rax),%xmm12
1179 movaps -0x68(%rax),%xmm13
1180 movaps -0x58(%rax),%xmm14
1181 movaps -0x48(%rax),%xmm15
# Common epilogue for all flavours: reload %r15, %r14, %r13, %r12, %rbp and
# %rbx from the six 8-byte slots just below the original stack pointer in
# %rax, then make %rax the stack pointer again.  .Ldec8x_epilogue is the
# label referenced as the second HandlerData entry for this routine in the
# Win64 unwind tables emitted later in this file.
1183 $code.=<<___;
1184 mov -48(%rax),%r15
1185 mov -40(%rax),%r14
1186 mov -32(%rax),%r13
1187 mov -24(%rax),%r12
1188 mov -16(%rax),%rbp
1189 mov -8(%rax),%rbx
1190 lea (%rax),%rsp
1191 .Ldec8x_epilogue:
1193 .size aesni_multi_cbc_decrypt_avx,.-aesni_multi_cbc_decrypt_avx
# Win64 structured exception handling support: one shared language-specific
# handler (se_handler) plus the .pdata/.xdata tables that attach it to each
# entry point generated by this file.  Emitted only for Win64 flavours.
1197 if ($win64) {
1198 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
1199 # CONTEXT *context,DISPATCHER_CONTEXT *disp)
# Registers in which the four handler arguments arrive; $context and $disp
# are interpolated into the assembly template below.
1200 $rec="%rcx";
1201 $frame="%rdx";
1202 $context="%r8";
1203 $disp="%r9";
# se_handler, as emitted by the template below:
#   * Rip below the label in HandlerData[0]: rax keeps context->Rax and
#     control goes straight to .Lin_prologue.
#   * Rip at or above the label in HandlerData[1]: rax is context->Rsp and
#     control goes straight to .Lin_prologue.
#   * Otherwise (inside the function body): the original stack pointer is
#     read from 16(context->Rsp); rbx, rbp and r12-r15 are read from the six
#     slots below it and stored into the context, and 20 qwords starting at
#     -56-10*16 from it are copied to context.Xmm6 onward.
#   * .Lin_prologue: rdi and rsi are read from 8(rax) and 16(rax) and stored,
#     together with rax as Rsp, into the context; the context is copied to
#     disp->ContextRecord; RtlVirtualUnwind is called; and the handler
#     returns ExceptionContinueSearch.
# The same here-document then opens .pdata with one begin/end/info RVA
# triplet for each of the two non-AVX entry points.
1205 $code.=<<___;
1206 .extern __imp_RtlVirtualUnwind
1207 .type se_handler,\@abi-omnipotent
1208 .align 16
1209 se_handler:
1210 push %rsi
1211 push %rdi
1212 push %rbx
1213 push %rbp
1214 push %r12
1215 push %r13
1216 push %r14
1217 push %r15
1218 pushfq
1219 sub \$64,%rsp
1221 mov 120($context),%rax # pull context->Rax
1222 mov 248($context),%rbx # pull context->Rip
1224 mov 8($disp),%rsi # disp->ImageBase
1225 mov 56($disp),%r11 # disp->HandlerData
1227 mov 0(%r11),%r10d # HandlerData[0]
1228 lea (%rsi,%r10),%r10 # prologue label
1229 cmp %r10,%rbx # context->Rip<.Lprologue
1230 jb .Lin_prologue
1232 mov 152($context),%rax # pull context->Rsp
1234 mov 4(%r11),%r10d # HandlerData[1]
1235 lea (%rsi,%r10),%r10 # epilogue label
1236 cmp %r10,%rbx # context->Rip>=.Lepilogue
1237 jae .Lin_prologue
1239 mov 16(%rax),%rax # pull saved stack pointer
1241 mov -8(%rax),%rbx
1242 mov -16(%rax),%rbp
1243 mov -24(%rax),%r12
1244 mov -32(%rax),%r13
1245 mov -40(%rax),%r14
1246 mov -48(%rax),%r15
1247 mov %rbx,144($context) # restore context->Rbx
1248 mov %rbp,160($context) # restore context->Rbp
1249 mov %r12,216($context) # restore context->R12
1250 mov %r13,224($context) # restore context->R13
1251 mov %r14,232($context) # restore context->R14
1252 mov %r15,240($context) # restore context->R15
1254 lea -56-10*16(%rax),%rsi
1255 lea 512($context),%rdi # &context.Xmm6
1256 mov \$20,%ecx
1257 .long 0xa548f3fc # cld; rep movsq
1259 .Lin_prologue:
1260 mov 8(%rax),%rdi
1261 mov 16(%rax),%rsi
1262 mov %rax,152($context) # restore context->Rsp
1263 mov %rsi,168($context) # restore context->Rsi
1264 mov %rdi,176($context) # restore context->Rdi
1266 mov 40($disp),%rdi # disp->ContextRecord
1267 mov $context,%rsi # context
1268 mov \$154,%ecx # sizeof(CONTEXT) in qwords
1269 .long 0xa548f3fc # cld; rep movsq
1271 mov $disp,%rsi
1272 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
1273 mov 8(%rsi),%rdx # arg2, disp->ImageBase
1274 mov 0(%rsi),%r8 # arg3, disp->ControlPc
1275 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
1276 mov 40(%rsi),%r10 # disp->ContextRecord
1277 lea 56(%rsi),%r11 # &disp->HandlerData
1278 lea 24(%rsi),%r12 # &disp->EstablisherFrame
1279 mov %r10,32(%rsp) # arg5
1280 mov %r11,40(%rsp) # arg6
1281 mov %r12,48(%rsp) # arg7
1282 mov %rcx,56(%rsp) # arg8, (NULL)
1283 call *__imp_RtlVirtualUnwind(%rip)
1285 mov \$1,%eax # ExceptionContinueSearch
1286 add \$64,%rsp
1287 popfq
1288 pop %r15
1289 pop %r14
1290 pop %r13
1291 pop %r12
1292 pop %rbp
1293 pop %rbx
1294 pop %rdi
1295 pop %rsi
1297 .size se_handler,.-se_handler
1299 .section .pdata
1300 .align 4
1301 .rva .LSEH_begin_aesni_multi_cbc_encrypt
1302 .rva .LSEH_end_aesni_multi_cbc_encrypt
1303 .rva .LSEH_info_aesni_multi_cbc_encrypt
1304 .rva .LSEH_begin_aesni_multi_cbc_decrypt
1305 .rva .LSEH_end_aesni_multi_cbc_decrypt
1306 .rva .LSEH_info_aesni_multi_cbc_decrypt
# Matching .pdata triplets for the two AVX entry points, appended only when
# $avx is set.
1308 $code.=<<___ if ($avx);
1309 .rva .LSEH_begin_aesni_multi_cbc_encrypt_avx
1310 .rva .LSEH_end_aesni_multi_cbc_encrypt_avx
1311 .rva .LSEH_info_aesni_multi_cbc_encrypt_avx
1312 .rva .LSEH_begin_aesni_multi_cbc_decrypt_avx
1313 .rva .LSEH_end_aesni_multi_cbc_decrypt_avx
1314 .rva .LSEH_info_aesni_multi_cbc_decrypt_avx
# .xdata: one unwind-info record per non-AVX entry point.  Each record is
# the header bytes 9,0,0,0, the RVA of se_handler, and two label RVAs that
# se_handler reads as HandlerData[0] and HandlerData[1].
# NOTE(review): 9,0,0,0 presumably encodes UNWIND_INFO version 1 with the
# exception-handler flag and no unwind codes -- confirm against the Windows
# x64 unwind data documentation.
1316 $code.=<<___;
1317 .section .xdata
1318 .align 8
1319 .LSEH_info_aesni_multi_cbc_encrypt:
1320 .byte 9,0,0,0
1321 .rva se_handler
1322 .rva .Lenc4x_body,.Lenc4x_epilogue # HandlerData[]
1323 .LSEH_info_aesni_multi_cbc_decrypt:
1324 .byte 9,0,0,0
1325 .rva se_handler
1326 .rva .Ldec4x_body,.Ldec4x_epilogue # HandlerData[]
# Matching .xdata records for the two AVX entry points, appended only when
# $avx is set.
1328 $code.=<<___ if ($avx);
1329 .LSEH_info_aesni_multi_cbc_encrypt_avx:
1330 .byte 9,0,0,0
1331 .rva se_handler
1332 .rva .Lenc8x_body,.Lenc8x_epilogue # HandlerData[]
1333 .LSEH_info_aesni_multi_cbc_decrypt_avx:
1334 .byte 9,0,0,0
1335 .rva se_handler
1336 .rva .Ldec8x_body,.Ldec8x_epilogue # HandlerData[]
1339 ####################################################################
# rex(\@opcode, $dst, $src)
#
# Appends a REX prefix byte to the caller's opcode list when either register
# number is 8 or higher; appends nothing when both are below 8.
#   \@opcode - reference to the byte list being built (modified in place)
#   $dst     - register number that the callers place in the ModR/M reg field
#   $src     - register number that the callers place in the ModR/M r/m field
1341 sub rex {
1342 local *opcode=shift; # alias @opcode to the caller's array so push below modifies it
1343 my ($dst,$src)=@_;
1344 my $rex=0;
1346 $rex|=0x04 if($dst>=8); # REX.R: extends the ModR/M reg field
1347 $rex|=0x01 if($src>=8); # REX.B: extends the ModR/M r/m field
1348 push @opcode,$rex|0x40 if($rex); # 0x40 is the fixed high nibble of a REX prefix
# aesni($line)
#
# Translates one AES-NI instruction, given as AT&T-syntax text, into an
# equivalent ".byte" directive listing its encoding in decimal
# (presumably so assemblers without AES-NI mnemonics can still build the
# output -- confirm).  Three operand forms are recognised:
#   1. aeskeygenassist $imm,%xmmS,%xmmD
#   2. <aes-op> %xmmS,%xmmD
#   3. <aes-op> off(%rsp),%xmmD
# Returns the ".byte" string on success, undef when form 2 or 3 matches but
# the mnemonic is not in that form's table, and $line unchanged when no form
# matches.
1351 sub aesni {
1352 my $line=shift;
1353 my @opcode=(0x66); # every encoding produced here starts with the 0x66 prefix
1355 if ($line=~/(aeskeygenassist)\s+\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
1356 rex(\@opcode,$4,$3);
1357 push @opcode,0x0f,0x3a,0xdf;
1358 push @opcode,0xc0|($3&7)|(($4&7)<<3); # ModR/M
1359 my $c=$2;
1360 push @opcode,$c=~/^0/?oct($c):$c; # immediate; a leading 0 means octal/hex text, converted by oct()
1361 return ".byte\t".join(',',@opcode);
1363 elsif ($line=~/(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) {
# Final opcode byte for each register-to-register mnemonic.
1364 my %opcodelet = (
1365 "aesimc" => 0xdb,
1366 "aesenc" => 0xdc, "aesenclast" => 0xdd,
1367 "aesdec" => 0xde, "aesdeclast" => 0xdf
1369 return undef if (!defined($opcodelet{$1}));
1370 rex(\@opcode,$3,$2);
1371 push @opcode,0x0f,0x38,$opcodelet{$1};
1372 push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M
1373 return ".byte\t".join(',',@opcode);
1375 elsif ($line=~/(aes[a-z]+)\s+([0x1-9a-fA-F]*)\(%rsp\),\s*%xmm([0-9]+)/) {
# Memory-source form; aesimc is not in this table, so it yields undef here.
1376 my %opcodelet = (
1377 "aesenc" => 0xdc, "aesenclast" => 0xdd,
1378 "aesdec" => 0xde, "aesdeclast" => 0xdf
1380 return undef if (!defined($opcodelet{$1}));
1381 my $off = $2;
1382 push @opcode,0x44 if ($3>=8); # REX prefix with the R bit, for %xmm8 and above
1383 push @opcode,0x0f,0x38,$opcodelet{$1};
1384 push @opcode,0x44|(($3&7)<<3),0x24; # ModR/M
1385 push @opcode,($off=~/^0/?oct($off):$off)&0xff; # displacement, truncated to one byte
1386 return ".byte\t".join(',',@opcode);
1388 return $line;
# Final pass over the generated assembly text, then output.

# Replace every backtick-quoted span in the template with the value of the
# Perl expression it contains (used for computed offsets in the templates
# above).
$code =~ s/\`([^\`]*)\`/eval($1)/gem;

# Hand each legacy-SSE AES-NI instruction to aesni() for translation.  The
# \b anchor keeps this from touching the VEX-encoded vaes* mnemonics, since
# there is no word boundary between the "v" and "aes".  Anything after the
# last %xmm operand on the line (such as a trailing comment) is dropped.
$code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem;

print $code;

# Output is buffered, so a write error may only surface at close; and if
# STDOUT is a pipe (presumably to the x86_64-xlate.pl filter located at the
# top of this file -- confirm), close also reports a non-zero exit status of
# the child.  Fail loudly instead of leaving a truncated output file behind.
# When only the child's exit status was bad, $! is 0 and $? holds the status.
close STDOUT
    or die "error closing STDOUT: " . ($! || "exit status $?") . "\n";