######################################################################
## Constant-time SSSE3 AES core implementation.
##
## By Mike Hamburg (Stanford University), 2009
##
## For details see http://shiftleft.org/papers/vector_aes/ and
## http://crypto.stanford.edu/vpaes/.
######################################################################
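
## The heart of the implementation, sketched here for orientation (an
## illustrative note, not emitted code): every byte is split into its low
## and high nibble, and each nibble indexes a 16-entry table held in an
## XMM register via pshufb, so the S-box and linear layers are evaluated
## without any data-dependent memory access:
##
##	lo = x & 0x0F;  hi = x >> 4;		/* nibble split        */
##	y  = tbl_lo[lo] ^ tbl_hi[hi];		/* two pshufb, one pxor */
##
## The names tbl_lo/tbl_hi are placeholders for the .Lk_* constants below.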
# Interface to OpenSSL as "almost" drop-in replacement for
# aes-x86_64.pl. "Almost" refers to the fact that AES_cbc_encrypt
# doesn't handle partial vectors (it doesn't have to if called from
# EVP only). "Drop-in" implies that this module neither shares the key
# schedule structure with the original nor makes assumptions about
# its alignment...
# Performance summary. aes-x86_64.pl column lists large-block CBC
# encrypt/decrypt/with-hyper-threading-off(*) results in cycles per
# byte processed with 128-bit key, and vpaes-x86_64.pl column -
# [also large-block CBC] encrypt/decrypt.
#
#		aes-x86_64.pl		vpaes-x86_64.pl
#
# Core 2(**)	30.5/43.7/14.3		21.8/25.7(***)
# Nehalem	30.5/42.2/14.6		 9.8/11.8
# Atom		63.9/79.0/32.1		64.0/84.8(***)
#
# (*)	"Hyper-threading" in this context refers to cache shared among
#	multiple cores rather than to Intel HTT specifically. As the vast
#	majority of contemporary cores share cache, the slower code path
#	is commonplace. In other words, the "with-hyper-threading-off"
#	results are presented mostly for reference purposes.
#
# (**)	"Core 2" refers to the initial 65nm design, a.k.a. Conroe.
#
# (***)	The less impressive improvement on Core 2 and Atom is due to
#	slow pshufb; it is nevertheless a respectable +40%/78% improvement
#	on Core 2 (as implied, over the "hyper-threading-safe" code path).
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour $output";
##  %xmm9-%xmm15 as in _vpaes_preheat
##  (%rdx) = scheduled keys
##
##  Clobbers %xmm1-%xmm5, %r9, %r10, %r11, %rax
##  Preserves %xmm6 - %xmm8 so you get some local vectors
##
.type	_vpaes_encrypt_core,\@abi-omnipotent
	movdqa	.Lk_ipt(%rip), %xmm2	# iptlo
	movdqu	(%r9), %xmm5		# round0 key
	movdqa	.Lk_ipt+16(%rip), %xmm0	# ipthi
	lea	.Lk_mc_backward(%rip),%r10
	# middle of middle round
	movdqa	%xmm13,	%xmm4		# 4 : sb1u
	pshufb	%xmm2,	%xmm4		# 4 = sb1u
	pxor	%xmm5,	%xmm4		# 4 = sb1u + k
	movdqa	%xmm12,	%xmm0		# 0 : sb1t
	pshufb	%xmm3,	%xmm0		# 0 = sb1t
	pxor	%xmm4,	%xmm0		# 0 = A
	movdqa	%xmm15,	%xmm5		# 4 : sb2u
	pshufb	%xmm2,	%xmm5		# 4 = sb2u
	movdqa	-0x40(%r11,%r10), %xmm1	# .Lk_mc_forward[]
	movdqa	%xmm14,	%xmm2		# 2 : sb2t
	pshufb	%xmm3,	%xmm2		# 2 = sb2t
	pxor	%xmm5,	%xmm2		# 2 = 2A
	movdqa	(%r11,%r10), %xmm4	# .Lk_mc_backward[]
	movdqa	%xmm0,	%xmm3		# 3 = A
	pshufb	%xmm1,	%xmm0		# 0 = B
	add	\$16,	%r9		# next key
	pxor	%xmm2,	%xmm0		# 0 = 2A+B
	pshufb	%xmm4,	%xmm3		# 3 = D
	add	\$16,	%r11		# next mc
	pxor	%xmm0,	%xmm3		# 3 = 2A+B+D
	pshufb	%xmm1,	%xmm0		# 0 = 2B+C
	and	\$0x30,	%r11		# ... mod 4
	pxor	%xmm3,	%xmm0		# 0 = 2A+3B+C+D
	movdqa	%xmm9,	%xmm1		# 1 : i
	pandn	%xmm0,	%xmm1		# 1 = i<<4
	psrld	\$4,	%xmm1		# 1 = i
	pand	%xmm9,	%xmm0		# 0 = k
	movdqa	%xmm11,	%xmm5		# 2 : a/k
	pshufb	%xmm0,	%xmm5		# 2 = a/k
	pxor	%xmm1,	%xmm0		# 0 = j
	movdqa	%xmm10,	%xmm3		# 3 : 1/i
	pshufb	%xmm1,	%xmm3		# 3 = 1/i
	pxor	%xmm5,	%xmm3		# 3 = iak = 1/i + a/k
	movdqa	%xmm10,	%xmm4		# 4 : 1/j
	pshufb	%xmm0,	%xmm4		# 4 = 1/j
	pxor	%xmm5,	%xmm4		# 4 = jak = 1/j + a/k
	movdqa	%xmm10,	%xmm2		# 2 : 1/iak
	pshufb	%xmm3,	%xmm2		# 2 = 1/iak
	pxor	%xmm0,	%xmm2		# 2 = io
	movdqa	%xmm10,	%xmm3		# 3 : 1/jak
	pshufb	%xmm4,	%xmm3		# 3 = 1/jak
	pxor	%xmm1,	%xmm3		# 3 = jo
	# middle of last round
	movdqa	-0x60(%r10), %xmm4	# 3 : sbou	.Lk_sbo
	movdqa	-0x50(%r10), %xmm0	# 0 : sbot	.Lk_sbo+16
	pshufb	%xmm2,	%xmm4		# 4 = sbou
	pxor	%xmm5,	%xmm4		# 4 = sb1u + k
	pshufb	%xmm3,	%xmm0		# 0 = sb1t
	movdqa	0x40(%r11,%r10), %xmm1	# .Lk_sr[]
	pxor	%xmm4,	%xmm0		# 0 = A
.size	_vpaes_encrypt_core,.-_vpaes_encrypt_core
##  Same API as encryption core.
##
.type	_vpaes_decrypt_core,\@abi-omnipotent
	mov	%rdx,	%r9		# load key
	movdqa	.Lk_dipt(%rip), %xmm2	# iptlo
	movdqu	(%r9),	%xmm5		# round0 key
	movdqa	.Lk_dipt+16(%rip), %xmm0	# ipthi
	lea	.Lk_dsbd(%rip),%r10
	movdqa	.Lk_mc_forward+48(%rip), %xmm5
	## Inverse mix columns
	movdqa	-0x20(%r10),%xmm4	# 4 : sb9u
	pshufb	%xmm2,	%xmm4		# 4 = sb9u
	movdqa	-0x10(%r10),%xmm0	# 0 : sb9t
	pshufb	%xmm3,	%xmm0		# 0 = sb9t
	pxor	%xmm4,	%xmm0		# 0 = ch
	add	\$16, %r9		# next round key

	pshufb	%xmm5,	%xmm0		# MC ch
	movdqa	0x00(%r10),%xmm4	# 4 : sbdu
	pshufb	%xmm2,	%xmm4		# 4 = sbdu
	pxor	%xmm0,	%xmm4		# 4 = ch
	movdqa	0x10(%r10),%xmm0	# 0 : sbdt
	pshufb	%xmm3,	%xmm0		# 0 = sbdt
	pxor	%xmm4,	%xmm0		# 0 = ch

	pshufb	%xmm5,	%xmm0		# MC ch
	movdqa	0x20(%r10),%xmm4	# 4 : sbbu
	pshufb	%xmm2,	%xmm4		# 4 = sbbu
	pxor	%xmm0,	%xmm4		# 4 = ch
	movdqa	0x30(%r10),%xmm0	# 0 : sbbt
	pshufb	%xmm3,	%xmm0		# 0 = sbbt
	pxor	%xmm4,	%xmm0		# 0 = ch

	pshufb	%xmm5,	%xmm0		# MC ch
	movdqa	0x40(%r10),%xmm4	# 4 : sbeu
	pshufb	%xmm2,	%xmm4		# 4 = sbeu
	pxor	%xmm0,	%xmm4		# 4 = ch
	movdqa	0x50(%r10),%xmm0	# 0 : sbet
	pshufb	%xmm3,	%xmm0		# 0 = sbet
	pxor	%xmm4,	%xmm0		# 0 = ch

	palignr	\$12,	%xmm5,	%xmm5
	movdqa	%xmm9,	%xmm1		# 1 : i
	pandn	%xmm0,	%xmm1		# 1 = i<<4
	psrld	\$4,	%xmm1		# 1 = i
	pand	%xmm9,	%xmm0		# 0 = k
	movdqa	%xmm11,	%xmm2		# 2 : a/k
	pshufb	%xmm0,	%xmm2		# 2 = a/k
	pxor	%xmm1,	%xmm0		# 0 = j
	movdqa	%xmm10,	%xmm3		# 3 : 1/i
	pshufb	%xmm1,	%xmm3		# 3 = 1/i
	pxor	%xmm2,	%xmm3		# 3 = iak = 1/i + a/k
	movdqa	%xmm10,	%xmm4		# 4 : 1/j
	pshufb	%xmm0,	%xmm4		# 4 = 1/j
	pxor	%xmm2,	%xmm4		# 4 = jak = 1/j + a/k
	movdqa	%xmm10,	%xmm2		# 2 : 1/iak
	pshufb	%xmm3,	%xmm2		# 2 = 1/iak
	pxor	%xmm0,	%xmm2		# 2 = io
	movdqa	%xmm10,	%xmm3		# 3 : 1/jak
	pshufb	%xmm4,	%xmm3		# 3 = 1/jak
	pxor	%xmm1,	%xmm3		# 3 = jo
	# middle of last round
	movdqa	0x60(%r10), %xmm4	# 3 : sbou
	pshufb	%xmm2,	%xmm4		# 4 = sbou
	pxor	%xmm0,	%xmm4		# 4 = sb1u + k
	movdqa	0x70(%r10), %xmm0	# 0 : sbot
	movdqa	-0x160(%r11), %xmm2	# .Lk_sr-.Lk_dsbd=-0x160
	pshufb	%xmm3,	%xmm0		# 0 = sb1t
	pxor	%xmm4,	%xmm0		# 0 = A
.size	_vpaes_decrypt_core,.-_vpaes_decrypt_core
########################################################
##                                                    ##
##                  AES key schedule                  ##
##                                                    ##
########################################################
.type	_vpaes_schedule_core,\@abi-omnipotent
_vpaes_schedule_core:
	# rcx = direction. 0=encrypt, 1=decrypt

	call	_vpaes_preheat		# load the tables
	movdqa	.Lk_rcon(%rip), %xmm8	# load rcon
	movdqu	(%rdi),	%xmm0		# load key (unaligned)

	lea	.Lk_ipt(%rip), %r11
	call	_vpaes_schedule_transform
	lea	.Lk_sr(%rip),%r10

	jnz	.Lschedule_am_decrypting

	# encrypting, output zeroth round key after transform

.Lschedule_am_decrypting:
	# decrypting, output zeroth round key after shiftrows
	movdqa	(%r8,%r10),%xmm1
##
##  128-bit specific part of key schedule.
##
##  This schedule is really simple, because all its parts
##  are accomplished by the subroutines.
##
	call	_vpaes_schedule_round
	jz	.Lschedule_mangle_last
	call	_vpaes_schedule_mangle	# write output
	jmp	.Loop_schedule_128
##
##  192-bit specific part of key schedule.
##
##  The main body of this schedule is the same as the 128-bit
##  schedule, but with more smearing.  The long, high side is
##  stored in %xmm7 as before, and the short, low side is in
##  the high bits of %xmm6.
##
##  This schedule is somewhat nastier, however, because each
##  round produces 192 bits of key material, or 1.5 round keys.
##  Therefore, on each cycle we do 2 rounds and produce 3 round
##  keys.
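##
##  (Worked count, for reference: AES-192 needs 13 round keys; the
##  round-0 key plus four trips through .Loop_schedule_192 at three
##  keys per trip, the very last of them emitted by
##  .Lschedule_mangle_last, account for all of them.)
##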
	movdqu	8(%rdi),%xmm0		# load key part 2 (very unaligned)
	call	_vpaes_schedule_transform	# input transform
	movdqa	%xmm0,	%xmm6		# save short part
	pxor	%xmm4,	%xmm4		# clear 4
	movhlps	%xmm4,	%xmm6		# clobber low side with zeros

	call	_vpaes_schedule_round
	palignr	\$8,%xmm6,%xmm0
	call	_vpaes_schedule_mangle	# save key n
	call	_vpaes_schedule_192_smear
	call	_vpaes_schedule_mangle	# save key n+1
	call	_vpaes_schedule_round
	jz	.Lschedule_mangle_last
	call	_vpaes_schedule_mangle	# save key n+2
	call	_vpaes_schedule_192_smear
	jmp	.Loop_schedule_192
##
##  256-bit specific part of key schedule.
##
##  The structure here is very similar to the 128-bit
##  schedule, but with an additional "low side" in
##  %xmm6.  The low side's rounds are the same as the
##  high side's, except no rcon and no rotation.
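##
##  (Worked count, for reference: AES-256 uses 15 round keys; the two
##  halves of the user key supply the first two, and each trip through
##  .Loop_schedule_256 emits one high-side and one low-side key, with
##  the final key written by .Lschedule_mangle_last.)
##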
	movdqu	16(%rdi),%xmm0		# load key part 2 (unaligned)
	call	_vpaes_schedule_transform	# input transform

	call	_vpaes_schedule_mangle	# output low result
	movdqa	%xmm0,	%xmm6		# save cur_lo in xmm6

	call	_vpaes_schedule_round
	jz	.Lschedule_mangle_last
	call	_vpaes_schedule_mangle

	# low round. swap xmm7 and xmm6
	pshufd	\$0xFF,	%xmm0,	%xmm0
	call	_vpaes_schedule_low_round
	jmp	.Loop_schedule_256
##
##  .aes_schedule_mangle_last
##
##  Mangler for last round of key schedule
##  when encrypting, outputs out(%xmm0) ^ 63
##  when decrypting, outputs unskew(%xmm0)
##
##  Always called right before return... jumps to cleanup and exits
##
.Lschedule_mangle_last:
	# schedule last round key from xmm0
	lea	.Lk_deskew(%rip),%r11	# prepare to deskew
	jnz	.Lschedule_mangle_last_dec

	movdqa	(%r8,%r10),%xmm1
	pshufb	%xmm1,	%xmm0		# output permute
	lea	.Lk_opt(%rip),	%r11	# prepare to output transform

.Lschedule_mangle_last_dec:
	pxor	.Lk_s63(%rip),	%xmm0
	call	_vpaes_schedule_transform	# output transform
	movdqu	%xmm0,	(%rdx)		# save last key
.size	_vpaes_schedule_core,.-_vpaes_schedule_core
##
##  .aes_schedule_192_smear
##
##  Smear the short, low side in the 192-bit key schedule.
##
##	%xmm7: high side, b  a  x  y
##	%xmm6: low side,  d  c  0  0
##
##	%xmm6: b+c+d  b+c  0  0
##	%xmm0: b+c+d  b+c  b  a
##
.type	_vpaes_schedule_192_smear,\@abi-omnipotent
_vpaes_schedule_192_smear:
	pshufd	\$0x80,	%xmm6,	%xmm0	# d c 0 0 -> c 0 0 0
	pxor	%xmm0,	%xmm6		# -> c+d c 0 0
	pshufd	\$0xFE,	%xmm7,	%xmm0	# b a _ _ -> b b b a
	pxor	%xmm0,	%xmm6		# -> b+c+d b+c b a
	movhlps	%xmm1,	%xmm6		# clobber low side with zeros
.size	_vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear
##
##  .aes_schedule_round
##
##  Runs one main round of the key schedule on %xmm0, %xmm7
##
##  Specifically, runs subbytes on the high dword of %xmm0,
##  then rotates it by one byte and xors it into the low dword of
##  %xmm7.
##
##  Adds rcon from low byte of %xmm8, then rotates %xmm8 for
##  the next rcon.
##
##  Smears the dwords of %xmm7 by xoring the low into the
##  second low, the result into the third, and that result into the
##  highest.
##
##  Returns results in %xmm7 = %xmm0.
##  Clobbers %xmm1-%xmm4, %r11.
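##
##  (For reference, this mirrors the scalar FIPS-197 expansion step:
##  with t = SubWord(RotWord(prev_w3)) ^ rcon, each word of the new
##  round key is the running prefix-xor of the previous key's words,
##  xored with t -- the prefix-xor being the "smear" described above.)
##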
.type	_vpaes_schedule_round,\@abi-omnipotent
_vpaes_schedule_round:
	# extract rcon from xmm8
	palignr	\$15,	%xmm8,	%xmm1
	palignr	\$15,	%xmm8,	%xmm8

	pshufd	\$0xFF,	%xmm0,	%xmm0
	palignr	\$1,	%xmm0,	%xmm0

	# low round: same as high round, but no rotation and no rcon.
_vpaes_schedule_low_round:
	pxor	.Lk_s63(%rip),	%xmm7
	psrld	\$4,	%xmm1		# 1 = i
	pand	%xmm9,	%xmm0		# 0 = k
	movdqa	%xmm11,	%xmm2		# 2 : a/k
	pshufb	%xmm0,	%xmm2		# 2 = a/k
	pxor	%xmm1,	%xmm0		# 0 = j
	movdqa	%xmm10,	%xmm3		# 3 : 1/i
	pshufb	%xmm1,	%xmm3		# 3 = 1/i
	pxor	%xmm2,	%xmm3		# 3 = iak = 1/i + a/k
	movdqa	%xmm10,	%xmm4		# 4 : 1/j
	pshufb	%xmm0,	%xmm4		# 4 = 1/j
	pxor	%xmm2,	%xmm4		# 4 = jak = 1/j + a/k
	movdqa	%xmm10,	%xmm2		# 2 : 1/iak
	pshufb	%xmm3,	%xmm2		# 2 = 1/iak
	pxor	%xmm0,	%xmm2		# 2 = io
	movdqa	%xmm10,	%xmm3		# 3 : 1/jak
	pshufb	%xmm4,	%xmm3		# 3 = 1/jak
	pxor	%xmm1,	%xmm3		# 3 = jo
	movdqa	%xmm13,	%xmm4		# 4 : sbou
	pshufb	%xmm2,	%xmm4		# 4 = sbou
	movdqa	%xmm12,	%xmm0		# 0 : sbot
	pshufb	%xmm3,	%xmm0		# 0 = sb1t
	pxor	%xmm4,	%xmm0		# 0 = sbox output

	# add in smeared stuff
.size	_vpaes_schedule_round,.-_vpaes_schedule_round
##
##  .aes_schedule_transform
##
##  Linear-transform %xmm0 according to tables at (%r11)
##
##  Requires that %xmm9 = 0x0F0F... as in preheat
##  Clobbers %xmm1, %xmm2
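##
##  (Illustrative sketch, not emitted code: the transform is applied
##  bytewise as
##	out = lo_tbl[x & 0x0F] ^ hi_tbl[x >> 4]
##  where lo_tbl = (%r11) and hi_tbl = 16(%r11), each a 16-byte pshufb
##  lookup table; the names lo_tbl/hi_tbl are placeholders.)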
##
.type	_vpaes_schedule_transform,\@abi-omnipotent
_vpaes_schedule_transform:
	movdqa	(%r11),	%xmm2		# lo
	movdqa	16(%r11), %xmm0		# hi
.size	_vpaes_schedule_transform,.-_vpaes_schedule_transform
##
##  .aes_schedule_mangle
##
##  Mangle xmm0 from the (basis-transformed) standard version
##  to our version; on encrypt:
##    multiply by circulant 0,1,1,1
##    apply shiftrows transform
##  and on decrypt:
##    multiply by "inverse mixcolumns" circulant E,B,D,9
##    apply shiftrows transform
##
##  Writes out to (%rdx), and increments or decrements it
##  Keeps track of round number mod 4 in %r8
##
##  Clobbers xmm1-xmm5
##
.type	_vpaes_schedule_mangle,\@abi-omnipotent
_vpaes_schedule_mangle:
	movdqa	%xmm0,	%xmm4	# save xmm0 for later
	movdqa	.Lk_mc_forward(%rip),%xmm5
	jnz	.Lschedule_mangle_dec

	pxor	.Lk_s63(%rip),%xmm4

	jmp	.Lschedule_mangle_both

.Lschedule_mangle_dec:
	# inverse mix columns
	lea	.Lk_dksd(%rip),%r11
	psrld	\$4,	%xmm1	# 1 = hi
	pand	%xmm9,	%xmm4	# 4 = lo

	movdqa	0x00(%r11), %xmm2
	movdqa	0x10(%r11), %xmm3

	movdqa	0x20(%r11), %xmm2
	movdqa	0x30(%r11), %xmm3

	movdqa	0x40(%r11), %xmm2
	movdqa	0x50(%r11), %xmm3

	movdqa	0x60(%r11), %xmm2
	movdqa	0x70(%r11), %xmm3

.Lschedule_mangle_both:
	movdqa	(%r8,%r10),%xmm1
.size	_vpaes_schedule_mangle,.-_vpaes_schedule_mangle
#
# Interface to OpenSSL
#
.globl	${PREFIX}_set_encrypt_key
.type	${PREFIX}_set_encrypt_key,\@function,3
${PREFIX}_set_encrypt_key:
$code.=<<___ if ($win64);
	movaps	%xmm6,0x10(%rsp)
	movaps	%xmm7,0x20(%rsp)
	movaps	%xmm8,0x30(%rsp)
	movaps	%xmm9,0x40(%rsp)
	movaps	%xmm10,0x50(%rsp)
	movaps	%xmm11,0x60(%rsp)
	movaps	%xmm12,0x70(%rsp)
	movaps	%xmm13,0x80(%rsp)
	movaps	%xmm14,0x90(%rsp)
	movaps	%xmm15,0xa0(%rsp)

	mov	%eax,240(%rdx)	# AES_KEY->rounds = nbits/32+5;

	call	_vpaes_schedule_core
$code.=<<___ if ($win64);
	movaps	0x10(%rsp),%xmm6
	movaps	0x20(%rsp),%xmm7
	movaps	0x30(%rsp),%xmm8
	movaps	0x40(%rsp),%xmm9
	movaps	0x50(%rsp),%xmm10
	movaps	0x60(%rsp),%xmm11
	movaps	0x70(%rsp),%xmm12
	movaps	0x80(%rsp),%xmm13
	movaps	0x90(%rsp),%xmm14
	movaps	0xa0(%rsp),%xmm15
.size	${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key
.globl	${PREFIX}_set_decrypt_key
.type	${PREFIX}_set_decrypt_key,\@function,3
${PREFIX}_set_decrypt_key:
$code.=<<___ if ($win64);
	movaps	%xmm6,0x10(%rsp)
	movaps	%xmm7,0x20(%rsp)
	movaps	%xmm8,0x30(%rsp)
	movaps	%xmm9,0x40(%rsp)
	movaps	%xmm10,0x50(%rsp)
	movaps	%xmm11,0x60(%rsp)
	movaps	%xmm12,0x70(%rsp)
	movaps	%xmm13,0x80(%rsp)
	movaps	%xmm14,0x90(%rsp)
	movaps	%xmm15,0xa0(%rsp)

	mov	%eax,240(%rdx)	# AES_KEY->rounds = nbits/32+5;
	lea	16(%rdx,%rax),%rdx

	xor	\$32,%r8d	# nbits==192?0:32
	call	_vpaes_schedule_core
$code.=<<___ if ($win64);
	movaps	0x10(%rsp),%xmm6
	movaps	0x20(%rsp),%xmm7
	movaps	0x30(%rsp),%xmm8
	movaps	0x40(%rsp),%xmm9
	movaps	0x50(%rsp),%xmm10
	movaps	0x60(%rsp),%xmm11
	movaps	0x70(%rsp),%xmm12
	movaps	0x80(%rsp),%xmm13
	movaps	0x90(%rsp),%xmm14
	movaps	0xa0(%rsp),%xmm15
.size	${PREFIX}_set_decrypt_key,.-${PREFIX}_set_decrypt_key
.globl	${PREFIX}_encrypt
.type	${PREFIX}_encrypt,\@function,3
$code.=<<___ if ($win64);
	movaps	%xmm6,0x10(%rsp)
	movaps	%xmm7,0x20(%rsp)
	movaps	%xmm8,0x30(%rsp)
	movaps	%xmm9,0x40(%rsp)
	movaps	%xmm10,0x50(%rsp)
	movaps	%xmm11,0x60(%rsp)
	movaps	%xmm12,0x70(%rsp)
	movaps	%xmm13,0x80(%rsp)
	movaps	%xmm14,0x90(%rsp)
	movaps	%xmm15,0xa0(%rsp)

	call	_vpaes_encrypt_core
$code.=<<___ if ($win64);
	movaps	0x10(%rsp),%xmm6
	movaps	0x20(%rsp),%xmm7
	movaps	0x30(%rsp),%xmm8
	movaps	0x40(%rsp),%xmm9
	movaps	0x50(%rsp),%xmm10
	movaps	0x60(%rsp),%xmm11
	movaps	0x70(%rsp),%xmm12
	movaps	0x80(%rsp),%xmm13
	movaps	0x90(%rsp),%xmm14
	movaps	0xa0(%rsp),%xmm15
.size	${PREFIX}_encrypt,.-${PREFIX}_encrypt
.globl	${PREFIX}_decrypt
.type	${PREFIX}_decrypt,\@function,3
$code.=<<___ if ($win64);
	movaps	%xmm6,0x10(%rsp)
	movaps	%xmm7,0x20(%rsp)
	movaps	%xmm8,0x30(%rsp)
	movaps	%xmm9,0x40(%rsp)
	movaps	%xmm10,0x50(%rsp)
	movaps	%xmm11,0x60(%rsp)
	movaps	%xmm12,0x70(%rsp)
	movaps	%xmm13,0x80(%rsp)
	movaps	%xmm14,0x90(%rsp)
	movaps	%xmm15,0xa0(%rsp)

	call	_vpaes_decrypt_core
$code.=<<___ if ($win64);
	movaps	0x10(%rsp),%xmm6
	movaps	0x20(%rsp),%xmm7
	movaps	0x30(%rsp),%xmm8
	movaps	0x40(%rsp),%xmm9
	movaps	0x50(%rsp),%xmm10
	movaps	0x60(%rsp),%xmm11
	movaps	0x70(%rsp),%xmm12
	movaps	0x80(%rsp),%xmm13
	movaps	0x90(%rsp),%xmm14
	movaps	0xa0(%rsp),%xmm15
.size	${PREFIX}_decrypt,.-${PREFIX}_decrypt
my ($inp,$out,$len,$key,$ivp,$enc)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9");
# void AES_cbc_encrypt (const unsigned char *inp, unsigned char *out,
#                       size_t length, const AES_KEY *key,
#                       unsigned char *ivp, const int enc);
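#
# Hypothetical caller-side sketch (assuming PREFIX expands to "vpaes"
# and the usual AES_KEY layout; illustrative only, not emitted code):
#
#	AES_KEY ks;
#	vpaes_set_encrypt_key(user_key, 128, &ks);
#	vpaes_cbc_encrypt(in, out, len, &ks, iv, 1);	/* enc=1: encrypt */
#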
.globl	${PREFIX}_cbc_encrypt
.type	${PREFIX}_cbc_encrypt,\@function,6
${PREFIX}_cbc_encrypt:
($len,$key)=($key,$len);
$code.=<<___ if ($win64);
	movaps	%xmm6,0x10(%rsp)
	movaps	%xmm7,0x20(%rsp)
	movaps	%xmm8,0x30(%rsp)
	movaps	%xmm9,0x40(%rsp)
	movaps	%xmm10,0x50(%rsp)
	movaps	%xmm11,0x60(%rsp)
	movaps	%xmm12,0x70(%rsp)
	movaps	%xmm13,0x80(%rsp)
	movaps	%xmm14,0x90(%rsp)
	movaps	%xmm15,0xa0(%rsp)

	movdqu	($ivp),%xmm6		# load IV

	call	_vpaes_encrypt_core

	movdqu	%xmm0,($out,$inp)

	call	_vpaes_decrypt_core

	movdqu	%xmm0,($out,$inp)

	movdqu	%xmm6,($ivp)		# save IV
$code.=<<___ if ($win64);
	movaps	0x10(%rsp),%xmm6
	movaps	0x20(%rsp),%xmm7
	movaps	0x30(%rsp),%xmm8
	movaps	0x40(%rsp),%xmm9
	movaps	0x50(%rsp),%xmm10
	movaps	0x60(%rsp),%xmm11
	movaps	0x70(%rsp),%xmm12
	movaps	0x80(%rsp),%xmm13
	movaps	0x90(%rsp),%xmm14
	movaps	0xa0(%rsp),%xmm15
.size	${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt
##
##  Fills register %r10 -> .aes_consts (so you can -fPIC)
##  and %xmm9-%xmm15 as specified below.
##
.type	_vpaes_preheat,\@abi-omnipotent
	lea	.Lk_s0F(%rip), %r10
	movdqa	-0x20(%r10), %xmm10	# .Lk_inv
	movdqa	-0x10(%r10), %xmm11	# .Lk_inv+16
	movdqa	0x00(%r10), %xmm9	# .Lk_s0F
	movdqa	0x30(%r10), %xmm13	# .Lk_sb1
	movdqa	0x40(%r10), %xmm12	# .Lk_sb1+16
	movdqa	0x50(%r10), %xmm15	# .Lk_sb2
	movdqa	0x60(%r10), %xmm14	# .Lk_sb2+16
.size	_vpaes_preheat,.-_vpaes_preheat
########################################################
##                                                    ##
##                     Constants                      ##
##                                                    ##
########################################################
.type	_vpaes_consts,\@object
	.quad	0x0E05060F0D080180, 0x040703090A0B0C02
	.quad	0x01040A060F0B0780, 0x030D0E0C02050809

	.quad	0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F

.Lk_ipt:	# input transform (lo, hi)
	.quad	0xC2B2E8985A2A7000, 0xCABAE09052227808
	.quad	0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81

.Lk_sb1:	# sb1u, sb1t
	.quad	0xB19BE18FCB503E00, 0xA5DF7A6E142AF544
	.quad	0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF
.Lk_sb2:	# sb2u, sb2t
	.quad	0xE27A93C60B712400, 0x5EB7E955BC982FCD
	.quad	0x69EB88400AE12900, 0xC2A163C8AB82234A
.Lk_sbo:	# sbou, sbot
	.quad	0xD0D26D176FBDC700, 0x15AABF7AC502A878
	.quad	0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA

.Lk_mc_forward:	# mc_forward
	.quad	0x0407060500030201, 0x0C0F0E0D080B0A09
	.quad	0x080B0A0904070605, 0x000302010C0F0E0D
	.quad	0x0C0F0E0D080B0A09, 0x0407060500030201
	.quad	0x000302010C0F0E0D, 0x080B0A0904070605

.Lk_mc_backward:# mc_backward
	.quad	0x0605040702010003, 0x0E0D0C0F0A09080B
	.quad	0x020100030E0D0C0F, 0x0A09080B06050407
	.quad	0x0E0D0C0F0A09080B, 0x0605040702010003
	.quad	0x0A09080B06050407, 0x020100030E0D0C0F

	.quad	0x0706050403020100, 0x0F0E0D0C0B0A0908
	.quad	0x030E09040F0A0500, 0x0B06010C07020D08
	.quad	0x0F060D040B020900, 0x070E050C030A0108
	.quad	0x0B0E0104070A0D00, 0x0306090C0F020508

	.quad	0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81

.Lk_s63:	# s63: all equal to 0x63 transformed
	.quad	0x5B5B5B5B5B5B5B5B, 0x5B5B5B5B5B5B5B5B

.Lk_opt:	# output transform
	.quad	0xFF9F4929D6B66000, 0xF7974121DEBE6808
	.quad	0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0

.Lk_deskew:	# deskew tables: inverts the sbox's "skew"
	.quad	0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A
	.quad	0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77

##
##  Key schedule constants
##
.Lk_dksd:	# decryption key schedule: invskew x*D
	.quad	0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9
	.quad	0x41C277F4B5368300, 0x5FDC69EAAB289D1E
.Lk_dksb:	# decryption key schedule: invskew x*B
	.quad	0x9A4FCA1F8550D500, 0x03D653861CC94C99
	.quad	0x115BEDA7B6FC4A00, 0xD993256F7E3482C8
.Lk_dkse:	# decryption key schedule: invskew x*E + 0x63
	.quad	0xD5031CCA1FC9D600, 0x53859A4C994F5086
	.quad	0xA23196054FDC7BE8, 0xCD5EF96A20B31487
.Lk_dks9:	# decryption key schedule: invskew x*9
	.quad	0xB6116FC87ED9A700, 0x4AED933482255BFC
	.quad	0x4576516227143300, 0x8BB89FACE9DAFDCE

##
##  Round function constants
##
.Lk_dipt:	# decryption input transform
	.quad	0x0F505B040B545F00, 0x154A411E114E451A
	.quad	0x86E383E660056500, 0x12771772F491F194

.Lk_dsb9:	# decryption sbox output *9*u, *9*t
	.quad	0x851C03539A86D600, 0xCAD51F504F994CC9
	.quad	0xC03B1789ECD74900, 0x725E2C9EB2FBA565
.Lk_dsbd:	# decryption sbox output *D*u, *D*t
	.quad	0x7D57CCDFE6B1A200, 0xF56E9B13882A4439
	.quad	0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3
.Lk_dsbb:	# decryption sbox output *B*u, *B*t
	.quad	0xD022649296B44200, 0x602646F6B0F2D404
	.quad	0xC19498A6CD596700, 0xF3FF0C3E3255AA6B
.Lk_dsbe:	# decryption sbox output *E*u, *E*t
	.quad	0x46F2929626D4D000, 0x2242600464B4F6B0
	.quad	0x0C55A6CDFFAAC100, 0x9467F36B98593E32
.Lk_dsbo:	# decryption sbox final output
	.quad	0x1387EA537EF94000, 0xC7AA6DB9D4943E2D
	.quad	0x12D7560F93441D00, 0xCA4B8159D8C58E9C
.asciz	"Vector Permutation AES for x86_64/SSSE3, Mike Hamburg (Stanford University)"
.size	_vpaes_consts,.-_vpaes_consts
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label

	lea	16(%rax),%rsi		# %xmm save area
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$20,%ecx		# 10*sizeof(%xmm0)/sizeof(%rax)
	.long	0xa548f3fc		# cld; rep movsq
	lea	0xb8(%rax),%rax		# adjust stack pointer

	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$`1232/8`,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
.size	se_handler,.-se_handler
	.rva	.LSEH_begin_${PREFIX}_set_encrypt_key
	.rva	.LSEH_end_${PREFIX}_set_encrypt_key
	.rva	.LSEH_info_${PREFIX}_set_encrypt_key

	.rva	.LSEH_begin_${PREFIX}_set_decrypt_key
	.rva	.LSEH_end_${PREFIX}_set_decrypt_key
	.rva	.LSEH_info_${PREFIX}_set_decrypt_key

	.rva	.LSEH_begin_${PREFIX}_encrypt
	.rva	.LSEH_end_${PREFIX}_encrypt
	.rva	.LSEH_info_${PREFIX}_encrypt

	.rva	.LSEH_begin_${PREFIX}_decrypt
	.rva	.LSEH_end_${PREFIX}_decrypt
	.rva	.LSEH_info_${PREFIX}_decrypt

	.rva	.LSEH_begin_${PREFIX}_cbc_encrypt
	.rva	.LSEH_end_${PREFIX}_cbc_encrypt
	.rva	.LSEH_info_${PREFIX}_cbc_encrypt

.LSEH_info_${PREFIX}_set_encrypt_key:
	.rva	.Lenc_key_body,.Lenc_key_epilogue	# HandlerData[]
.LSEH_info_${PREFIX}_set_decrypt_key:
	.rva	.Ldec_key_body,.Ldec_key_epilogue	# HandlerData[]
.LSEH_info_${PREFIX}_encrypt:
	.rva	.Lenc_body,.Lenc_epilogue		# HandlerData[]
.LSEH_info_${PREFIX}_decrypt:
	.rva	.Ldec_body,.Ldec_epilogue		# HandlerData[]
.LSEH_info_${PREFIX}_cbc_encrypt:
	.rva	.Lcbc_body,.Lcbc_epilogue		# HandlerData[]
$code =~ s/\`([^\`]*)\`/eval($1)/gem;