#!/usr/bin/env perl

###################################################################
### AES-128 [originally in CTR mode]                            ###
### bitsliced implementation for Intel Core 2 processors        ###
### requires support of SSE extensions up to SSSE3              ###
### Author: Emilia Käsper and Peter Schwabe                     ###
### Date: 2009-03-19                                            ###
### Public domain                                               ###
###                                                             ###
### See http://homes.esat.kuleuven.be/~ekasper/#software for    ###
### further information.                                        ###
###################################################################
# September 2011.

# Started as a transliteration to "perlasm", the original code has
# undergone the following changes:

# - code was made position-independent;
# - rounds were folded into a loop resulting in >5x size reduction
#   from 12.5KB to 2.2KB;
# - the above was possible thanks to a mixcolumns() modification that
#   allowed feeding its output back to aesenc[last]; this was
#   achieved at the cost of two additional inter-register moves;
# - some instruction reordering and interleaving;
# - this module doesn't implement a key setup subroutine; instead it
#   relies on conversion of the "conventional" key schedule as returned
#   by AES_set_encrypt_key (see discussion below);
# - first and last round keys are treated differently, which allowed
#   skipping one shiftrows(), reducing the bit-sliced key schedule and
#   speeding up conversion by 22%;
# - support for 192- and 256-bit keys was added;

# Resulting performance in CPU cycles spent to encrypt one byte out
# of 4096-byte buffer with 128-bit key is:

#		Emilia's	this(*)		difference
#
# Core 2    	9.30		8.69		+7%
# Nehalem(**) 	7.63		6.98		+9%
# Atom	    	17.1		17.4		-2%(***)

# (*)	Comparison is not completely fair, because "this" is ECB,
#	i.e. no extra processing such as counter values calculation
#	and xor-ing input as in Emilia's CTR implementation is
#	performed. However, the CTR calculations account for no more
#	than 1% of total time, so the comparison is *rather* fair.

# (**)	Results were collected on Westmere, which is considered to
#	be equivalent to Nehalem for this code.

# (***)	Slowdown on Atom is rather strange per se, because the original
#	implementation has a number of 9+-byte instructions, which
#	are bad for the Atom front-end, and which I eliminated completely.
#	In an attempt to address the deterioration, sbox() was tested in
#	the FP SIMD "domain" (movaps instead of movdqa, xorps instead of
#	pxor, etc.). While it resulted in a nominal 4% improvement on
#	Atom, it hurt Westmere by a factor of more than 2x.

# As for the key schedule conversion subroutine: the interface to
# OpenSSL relies on per-invocation on-the-fly conversion. This naturally
# has an impact on performance, especially for short inputs. Conversion
# time in CPU cycles and its ratio to CPU cycles spent in the 8x block
# function is:

#		conversion	conversion/8x block
# Core 2	240		0.22
# Nehalem	180		0.20
# Atom		430		0.19

# The ratio values mean that 128-byte blocks will be processed
# 16-18% slower, 256-byte blocks - 9-10%, 384-byte blocks - 6-7%,
# etc. Then keep in mind that input sizes not divisible by 128 are
# *effectively* slower, especially the shortest ones, e.g. consecutive
# 144-byte blocks are processed 44% slower than one would expect,
# 272 - 29%, 400 - 22%, etc. Yet, despite all these "shortcomings"
# it's still faster than the ["hyper-threading-safe" code path in]
# aes-x86_64.pl on all lengths above 64 bytes...
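#
# (For the record, those figures follow directly from the ratios
# above: a single 128-byte group costs roughly 1+0.22 units instead
# of 1 on Core 2, i.e. ~18% extra, two groups amortize the same
# conversion over twice the work for ~10%, and so on.)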
# October 2011.

# Add decryption procedure. Performance in CPU cycles spent to decrypt
# one byte out of 4096-byte buffer with 128-bit key is:

# Core 2	9.83
# Nehalem	7.74
# Atom		19.0

# November 2011.

# Add bsaes_xts_[en|de]crypt. Less-than-80-bytes-block performance is
# suboptimal, but XTS is meant to be used with larger blocks...

#						<appro@openssl.org>
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;

my ($inp,$out,$len,$key,$ivp)=("%rdi","%rsi","%rdx","%rcx");
my @XMM=map("%xmm$_",(15,0..14));	# best on Atom, +10% over (0..15)
my $ecb=0;	# suppress unreferenced ECB subroutines, spare some space...

{
my ($key,$rounds,$const)=("%rax","%r10d","%r11");
sub Sbox {
# input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
# output in lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb
my @b=@_[0..7];
my @t=@_[8..11];
my @s=@_[12..15];
	&InBasisChange	(@b);
	&Inv_GF256	(@b[6,5,0,3,7,1,4,2],@t,@s);
	&OutBasisChange	(@b[7,1,4,2,6,5,0,3]);
}

sub InBasisChange {
# input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
# output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb
my @b=@_[0..7];
$code.=<<___;
	pxor	@b[6], @b[5]
	pxor	@b[1], @b[2]
	pxor	@b[0], @b[3]
	pxor	@b[2], @b[6]
	pxor	@b[0], @b[5]

	pxor	@b[3], @b[6]
	pxor	@b[7], @b[3]
	pxor	@b[5], @b[7]
	pxor	@b[4], @b[3]
	pxor	@b[5], @b[4]
	pxor	@b[1], @b[3]

	pxor	@b[7], @b[2]
	pxor	@b[5], @b[1]
___
}

sub OutBasisChange {
# input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
# output in lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb
my @b=@_[0..7];
$code.=<<___;
	pxor	@b[6], @b[0]
	pxor	@b[4], @b[1]
	pxor	@b[0], @b[2]
	pxor	@b[6], @b[4]
	pxor	@b[1], @b[6]

	pxor	@b[5], @b[1]
	pxor	@b[3], @b[5]
	pxor	@b[7], @b[3]
	pxor	@b[5], @b[7]
	pxor	@b[5], @b[2]

	pxor	@b[7], @b[4]
___
}

sub InvSbox {
# input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
# output in lsb > [b0, b1, b6, b4, b2, b7, b3, b5] < msb
my @b=@_[0..7];
my @t=@_[8..11];
my @s=@_[12..15];
	&InvInBasisChange	(@b);
	&Inv_GF256		(@b[5,1,2,6,3,7,0,4],@t,@s);
	&InvOutBasisChange	(@b[3,7,0,4,5,1,2,6]);
}

sub InvInBasisChange {		# OutBasisChange in reverse
my @b=@_[5,1,2,6,3,7,0,4];
$code.=<<___
	pxor	@b[7], @b[4]

	pxor	@b[5], @b[7]
	pxor	@b[5], @b[2]
	pxor	@b[7], @b[3]
	pxor	@b[3], @b[5]
	pxor	@b[5], @b[1]

	pxor	@b[1], @b[6]
	pxor	@b[0], @b[2]
	pxor	@b[6], @b[4]
	pxor	@b[6], @b[0]
	pxor	@b[4], @b[1]
___
}

sub InvOutBasisChange {		# InBasisChange in reverse
my @b=@_[2,5,7,3,6,1,0,4];
$code.=<<___;
	pxor	@b[5], @b[1]
	pxor	@b[7], @b[2]

	pxor	@b[1], @b[3]
	pxor	@b[5], @b[4]
	pxor	@b[5], @b[7]
	pxor	@b[4], @b[3]
	pxor	@b[0], @b[5]
	pxor	@b[7], @b[3]
	pxor	@b[2], @b[6]
	pxor	@b[1], @b[2]
	pxor	@b[3], @b[6]

	pxor	@b[0], @b[3]
	pxor	@b[6], @b[5]
___
}

sub Mul_GF4 {
#;*************************************************************
#;* Mul_GF4: Input x0-x1,y0-y1 Output x0-x1 Temp t0 (8)       *
#;*************************************************************
my ($x0,$x1,$y0,$y1,$t0)=@_;
$code.=<<___;
	movdqa	$y0, $t0
	pxor	$y1, $t0
	pand	$x0, $t0
	pxor	$x1, $x0
	pand	$y0, $x1
	pand	$y1, $x0
	pxor	$x1, $x0
	pxor	$t0, $x1
___
}
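# A per-bit-lane scalar model of the GF(2^2) multiplication above
# (an illustrative sketch only; nothing in this module calls it):
sub Mul_GF4_model {
my ($x0,$x1,$y0,$y1)=@_;
	my $t = ($y0 ^ $y1) & $x0;			# t0
	return ((($x0 ^ $x1) & $y1) ^ ($x1 & $y0),	# new x0
		($x1 & $y0) ^ $t);			# new x1
}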
sub Mul_GF4_N {				# not used, see next subroutine
# multiply and scale by N
my ($x0,$x1,$y0,$y1,$t0)=@_;
$code.=<<___;
	movdqa	$y0, $t0
	pxor	$y1, $t0
	pand	$x0, $t0
	pxor	$x1, $x0
	pand	$y0, $x1
	pand	$y1, $x0
	pxor	$x0, $x1
	pxor	$t0, $x0
___
}

sub Mul_GF4_N_GF4 {
# interleaved Mul_GF4_N and Mul_GF4
my ($x0,$x1,$y0,$y1,$t0,
    $x2,$x3,$y2,$y3,$t1)=@_;
$code.=<<___;
	movdqa	$y0, $t0
	movdqa	$y2, $t1
	pxor	$y1, $t0
	pxor	$y3, $t1
	pand	$x0, $t0
	pand	$x2, $t1
	pxor	$x1, $x0
	pxor	$x3, $x2
	pand	$y0, $x1
	pand	$y2, $x3
	pand	$y1, $x0
	pand	$y3, $x2
	pxor	$x0, $x1
	pxor	$x3, $x2
	pxor	$t0, $x0
	pxor	$t1, $x3
___
}

sub Mul_GF16_2 {
my @x=@_[0..7];
my @y=@_[8..11];
my @t=@_[12..15];
$code.=<<___;
	movdqa	@x[0], @t[0]
	movdqa	@x[1], @t[1]
___
	&Mul_GF4	(@x[0], @x[1], @y[0], @y[1], @t[2]);
$code.=<<___;
	pxor	@x[2], @t[0]
	pxor	@x[3], @t[1]
	pxor	@y[2], @y[0]
	pxor	@y[3], @y[1]
___
	Mul_GF4_N_GF4	(@t[0], @t[1], @y[0], @y[1], @t[3],
			 @x[2], @x[3], @y[2], @y[3], @t[2]);
$code.=<<___;
	pxor	@t[0], @x[0]
	pxor	@t[0], @x[2]
	pxor	@t[1], @x[1]
	pxor	@t[1], @x[3]

	movdqa	@x[4], @t[0]
	movdqa	@x[5], @t[1]
	pxor	@x[6], @t[0]
	pxor	@x[7], @t[1]
___
	&Mul_GF4_N_GF4	(@t[0], @t[1], @y[0], @y[1], @t[3],
			 @x[6], @x[7], @y[2], @y[3], @t[2]);
$code.=<<___;
	pxor	@y[2], @y[0]
	pxor	@y[3], @y[1]
___
	&Mul_GF4	(@x[4], @x[5], @y[0], @y[1], @t[3]);
$code.=<<___;
	pxor	@t[0], @x[4]
	pxor	@t[0], @x[6]
	pxor	@t[1], @x[5]
	pxor	@t[1], @x[7]
___
}
sub Inv_GF256 {
#;********************************************************************
#;* Inv_GF256: Input x0-x7 Output x0-x7 Temp t0-t3,s0-s3 (144)       *
#;********************************************************************
my @x=@_[0..7];
my @t=@_[8..11];
my @s=@_[12..15];
# direct optimizations from hardware
$code.=<<___;
	movdqa	@x[4], @t[3]
	movdqa	@x[5], @t[2]
	movdqa	@x[1], @t[1]
	movdqa	@x[7], @s[1]
	movdqa	@x[0], @s[0]

	pxor	@x[6], @t[3]
	pxor	@x[7], @t[2]
	pxor	@x[3], @t[1]
	movdqa	@t[3], @s[2]
	pxor	@x[6], @s[1]
	movdqa	@t[2], @t[0]
	pxor	@x[2], @s[0]
	movdqa	@t[3], @s[3]

	por	@t[1], @t[2]
	por	@s[0], @t[3]
	pxor	@t[0], @s[3]
	pand	@s[0], @s[2]
	pxor	@t[1], @s[0]
	pand	@t[1], @t[0]
	pand	@s[0], @s[3]
	movdqa	@x[3], @s[0]
	pxor	@x[2], @s[0]
	pand	@s[0], @s[1]
	pxor	@s[1], @t[3]
	pxor	@s[1], @t[2]
	movdqa	@x[4], @s[1]
	movdqa	@x[1], @s[0]
	pxor	@x[5], @s[1]
	pxor	@x[0], @s[0]
	movdqa	@s[1], @t[1]
	pand	@s[0], @s[1]
	por	@s[0], @t[1]
	pxor	@s[1], @t[0]
	pxor	@s[3], @t[3]
	pxor	@s[2], @t[2]
	pxor	@s[3], @t[1]
	movdqa	@x[7], @s[0]
	pxor	@s[2], @t[0]
	movdqa	@x[6], @s[1]
	pxor	@s[2], @t[1]
	movdqa	@x[5], @s[2]
	pand	@x[3], @s[0]
	movdqa	@x[4], @s[3]
	pand	@x[2], @s[1]
	pand	@x[1], @s[2]
	por	@x[0], @s[3]
	pxor	@s[0], @t[3]
	pxor	@s[1], @t[2]
	pxor	@s[2], @t[1]
	pxor	@s[3], @t[0]

	#Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3

	# new smaller inversion

	movdqa	@t[3], @s[0]
	pand	@t[1], @t[3]
	pxor	@t[2], @s[0]

	movdqa	@t[0], @s[2]
	movdqa	@s[0], @s[3]
	pxor	@t[3], @s[2]
	pand	@s[2], @s[3]

	movdqa	@t[1], @s[1]
	pxor	@t[2], @s[3]
	pxor	@t[0], @s[1]

	pxor	@t[2], @t[3]

	pand	@t[3], @s[1]

	movdqa	@s[2], @t[2]
	pxor	@t[0], @s[1]

	pxor	@s[1], @t[2]
	pxor	@s[1], @t[1]

	pand	@t[0], @t[2]

	pxor	@t[2], @s[2]
	pxor	@t[2], @t[1]

	pand	@s[3], @s[2]

	pxor	@s[0], @s[2]
___
# output in s3, s2, s1, t1

# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \t2, \t3, \t0, \t1, \s0, \s1, \s2, \s3
# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
	&Mul_GF16_2(@x,@s[3,2,1],@t[1],@s[0],@t[0,2,3]);

### output msb > [x3,x2,x1,x0,x7,x6,x5,x4] < lsb
}
# AES linear components

sub ShiftRows {
my @x=@_[0..7];
my $mask=pop;
$code.=<<___;
	pxor	0x00($key),@x[0]
	pxor	0x10($key),@x[1]
	pshufb	$mask,@x[0]
	pxor	0x20($key),@x[2]
	pshufb	$mask,@x[1]
	pxor	0x30($key),@x[3]
	pshufb	$mask,@x[2]
	pxor	0x40($key),@x[4]
	pshufb	$mask,@x[3]
	pxor	0x50($key),@x[5]
	pshufb	$mask,@x[4]
	pxor	0x60($key),@x[6]
	pshufb	$mask,@x[5]
	pxor	0x70($key),@x[7]
	pshufb	$mask,@x[6]
	lea	0x80($key),$key
	pshufb	$mask,@x[7]
___
}
sub MixColumns {
# modified to emit output in order suitable for feeding back to aesenc[last]
my @x=@_[0..7];
my @t=@_[8..15];
my $inv=@_[16];	# optional
$code.=<<___;
	pshufd	\$0x93, @x[0], @t[0]	# x0 <<< 32
	pshufd	\$0x93, @x[1], @t[1]
	pxor	@t[0], @x[0]		# x0 ^ (x0 <<< 32)
	pshufd	\$0x93, @x[2], @t[2]
	pxor	@t[1], @x[1]
	pshufd	\$0x93, @x[3], @t[3]
	pxor	@t[2], @x[2]
	pshufd	\$0x93, @x[4], @t[4]
	pxor	@t[3], @x[3]
	pshufd	\$0x93, @x[5], @t[5]
	pxor	@t[4], @x[4]
	pshufd	\$0x93, @x[6], @t[6]
	pxor	@t[5], @x[5]
	pshufd	\$0x93, @x[7], @t[7]
	pxor	@t[6], @x[6]
	pxor	@t[7], @x[7]

	pxor	@x[0], @t[1]
	pxor	@x[7], @t[0]
	pxor	@x[7], @t[1]
	pshufd	\$0x4E, @x[0], @x[0]	# (x0 ^ (x0 <<< 32)) <<< 64)
	pxor	@x[1], @t[2]
	pshufd	\$0x4E, @x[1], @x[1]
	pxor	@x[4], @t[5]
	pxor	@t[0], @x[0]
	pxor	@x[5], @t[6]
	pxor	@t[1], @x[1]
	pxor	@x[3], @t[4]
	pshufd	\$0x4E, @x[4], @t[0]
	pxor	@x[6], @t[7]
	pshufd	\$0x4E, @x[5], @t[1]
	pxor	@x[2], @t[3]
	pshufd	\$0x4E, @x[3], @x[4]
	pxor	@x[7], @t[3]
	pshufd	\$0x4E, @x[7], @x[5]
	pxor	@x[7], @t[4]
	pshufd	\$0x4E, @x[6], @x[3]
	pxor	@t[4], @t[0]
	pshufd	\$0x4E, @x[2], @x[6]
	pxor	@t[5], @t[1]
___
$code.=<<___ if (!$inv);
	pxor	@t[3], @x[4]
	pxor	@t[7], @x[5]
	pxor	@t[6], @x[3]
	movdqa	@t[0], @x[2]
	pxor	@t[2], @x[6]
	movdqa	@t[1], @x[7]
___
$code.=<<___ if ($inv);
	pxor	@x[4], @t[3]
	pxor	@t[7], @x[5]
	pxor	@x[3], @t[6]
	movdqa	@t[0], @x[3]
	pxor	@t[2], @x[6]
	movdqa	@t[6], @x[2]
	movdqa	@t[1], @x[7]
	movdqa	@x[6], @x[4]
	movdqa	@t[3], @x[6]
___
}
sub InvMixColumns_orig {
my @x=@_[0..7];
my @t=@_[8..15];

$code.=<<___;
	# multiplication by 0x0e
	pshufd	\$0x93, @x[7], @t[7]
	movdqa	@x[2], @t[2]
	pxor	@x[5], @x[7]		# 7 5
	pxor	@x[5], @x[2]		# 2 5
	pshufd	\$0x93, @x[0], @t[0]
	movdqa	@x[5], @t[5]
	pxor	@x[0], @x[5]		# 5 0		[1]
	pxor	@x[1], @x[0]		# 0 1
	pshufd	\$0x93, @x[1], @t[1]
	pxor	@x[2], @x[1]		# 1 25
	pxor	@x[6], @x[0]		# 01 6		[2]
	pxor	@x[3], @x[1]		# 125 3		[4]
	pshufd	\$0x93, @x[3], @t[3]
	pxor	@x[0], @x[2]		# 25 016	[3]
	pxor	@x[7], @x[3]		# 3 75
	pxor	@x[6], @x[7]		# 75 6		[0]
	pshufd	\$0x93, @x[6], @t[6]
	movdqa	@x[4], @t[4]
	pxor	@x[4], @x[6]		# 6 4
	pxor	@x[3], @x[4]		# 4 375		[6]
	pxor	@x[7], @x[3]		# 375 756=36
	pxor	@t[5], @x[6]		# 64 5		[7]
	pxor	@t[2], @x[3]		# 36 2
	pxor	@t[4], @x[3]		# 362 4		[5]
	pshufd	\$0x93, @t[5], @t[5]
___
my @y = @x[7,5,0,2,1,3,4,6];
$code.=<<___;
	# multiplication by 0x0b
	pxor	@y[0], @y[1]
	pxor	@t[0], @y[0]
	pxor	@t[1], @y[1]
	pshufd	\$0x93, @t[2], @t[2]
	pxor	@t[5], @y[0]
	pxor	@t[6], @y[1]
	pxor	@t[7], @y[0]
	pshufd	\$0x93, @t[4], @t[4]
	pxor	@t[6], @t[7]		# clobber t[7]
	pxor	@y[0], @y[1]

	pxor	@t[0], @y[3]
	pshufd	\$0x93, @t[0], @t[0]
	pxor	@t[1], @y[2]
	pxor	@t[1], @y[4]
	pxor	@t[2], @y[2]
	pshufd	\$0x93, @t[1], @t[1]
	pxor	@t[2], @y[3]
	pxor	@t[2], @y[5]
	pxor	@t[7], @y[2]
	pshufd	\$0x93, @t[2], @t[2]
	pxor	@t[3], @y[3]
	pxor	@t[3], @y[6]
	pxor	@t[3], @y[4]
	pshufd	\$0x93, @t[3], @t[3]
	pxor	@t[4], @y[7]
	pxor	@t[4], @y[5]
	pxor	@t[7], @y[7]
	pxor	@t[5], @y[3]
	pxor	@t[4], @y[4]
	pxor	@t[5], @t[7]		# clobber t[7] even more

	pxor	@t[7], @y[5]
	pshufd	\$0x93, @t[4], @t[4]
	pxor	@t[7], @y[6]
	pxor	@t[7], @y[4]

	pxor	@t[5], @t[7]
	pshufd	\$0x93, @t[5], @t[5]
	pxor	@t[6], @t[7]		# restore t[7]

	# multiplication by 0x0d
	pxor	@y[7], @y[4]
	pxor	@t[4], @y[7]
	pshufd	\$0x93, @t[6], @t[6]
	pxor	@t[0], @y[2]
	pxor	@t[5], @y[7]
	pxor	@t[2], @y[2]
	pshufd	\$0x93, @t[7], @t[7]

	pxor	@y[1], @y[3]
	pxor	@t[1], @y[1]
	pxor	@t[0], @y[0]
	pxor	@t[0], @y[3]
	pxor	@t[5], @y[1]
	pxor	@t[5], @y[0]
	pxor	@t[7], @y[1]
	pshufd	\$0x93, @t[0], @t[0]
	pxor	@t[6], @y[0]
	pxor	@y[1], @y[3]
	pxor	@t[1], @y[4]
	pshufd	\$0x93, @t[1], @t[1]

	pxor	@t[7], @y[7]
	pxor	@t[2], @y[4]
	pxor	@t[2], @y[5]
	pshufd	\$0x93, @t[2], @t[2]
	pxor	@t[6], @y[2]
	pxor	@t[3], @t[6]		# clobber t[6]
	pxor	@y[7], @y[4]
	pxor	@t[6], @y[3]

	pxor	@t[6], @y[6]
	pxor	@t[5], @y[5]
	pxor	@t[4], @y[6]
	pshufd	\$0x93, @t[4], @t[4]
	pxor	@t[6], @y[5]
	pxor	@t[7], @y[6]
	pxor	@t[3], @t[6]		# restore t[6]

	pshufd	\$0x93, @t[5], @t[5]
	pshufd	\$0x93, @t[6], @t[6]
	pshufd	\$0x93, @t[7], @t[7]
	pshufd	\$0x93, @t[3], @t[3]

	# multiplication by 0x09
	pxor	@y[1], @y[4]
	pxor	@y[1], @t[1]		# t[1]=y[1]
	pxor	@t[5], @t[0]		# clobber t[0]
	pxor	@t[5], @t[1]
	pxor	@t[0], @y[3]
	pxor	@y[0], @t[0]		# t[0]=y[0]
	pxor	@t[6], @t[1]
	pxor	@t[7], @t[6]		# clobber t[6]
	pxor	@t[1], @y[4]
	pxor	@t[4], @y[7]
	pxor	@y[4], @t[4]		# t[4]=y[4]
	pxor	@t[3], @y[6]
	pxor	@y[3], @t[3]		# t[3]=y[3]
	pxor	@t[2], @y[5]
	pxor	@y[2], @t[2]		# t[2]=y[2]
	pxor	@t[7], @t[3]
	pxor	@y[5], @t[5]		# t[5]=y[5]
	pxor	@t[6], @t[2]
	pxor	@t[6], @t[5]
	pxor	@y[6], @t[6]		# t[6]=y[6]
	pxor	@y[7], @t[7]		# t[7]=y[7]

	movdqa	@t[0],@XMM[0]
	movdqa	@t[1],@XMM[1]
	movdqa	@t[2],@XMM[2]
	movdqa	@t[3],@XMM[3]
	movdqa	@t[4],@XMM[4]
	movdqa	@t[5],@XMM[5]
	movdqa	@t[6],@XMM[6]
	movdqa	@t[7],@XMM[7]
___
}
sub InvMixColumns {
my @x=@_[0..7];
my @t=@_[8..15];

# Thanks to Jussi Kivilinna for providing a pointer to
#
# | 0e 0b 0d 09 |   | 02 03 01 01 |   | 05 00 04 00 |
# | 09 0e 0b 0d | = | 01 02 03 01 | x | 00 05 00 04 |
# | 0d 09 0e 0b |   | 01 01 02 03 |   | 04 00 05 00 |
# | 0b 0d 09 0e |   | 03 01 01 02 |   | 00 04 00 05 |
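#
# A quick scalar sanity check of that factorization (an illustrative
# sketch only; gf_mul_model is not part of this module and is never
# called during code generation):
#
#	sub gf_mul_model {	# GF(2^8) multiply mod x^8+x^4+x^3+x+1
#	my ($a,$b)=@_;
#		my $r=0;
#		while ($b) {
#			$r ^= $a if ($b & 1);
#			$b >>= 1;
#			$a = (($a<<1) ^ (($a & 0x80) ? 0x11b : 0)) & 0xff;
#		}
#		return $r;
#	}
#
# e.g. row 1, column 1: gf_mul_model(0x02,0x05)^gf_mul_model(0x01,0x04)
# evaluates to 0x0e, as the identity above requires.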
$code.=<<___;
	# multiplication by 0x05-0x00-0x04-0x00
	pshufd	\$0x4E, @x[0], @t[0]
	pshufd	\$0x4E, @x[6], @t[6]
	pxor	@x[0], @t[0]
	pshufd	\$0x4E, @x[7], @t[7]
	pxor	@x[6], @t[6]
	pshufd	\$0x4E, @x[1], @t[1]
	pxor	@x[7], @t[7]
	pshufd	\$0x4E, @x[2], @t[2]
	pxor	@x[1], @t[1]
	pshufd	\$0x4E, @x[3], @t[3]
	pxor	@x[2], @t[2]
	pxor	@t[6], @x[0]
	pxor	@t[6], @x[1]
	pshufd	\$0x4E, @x[4], @t[4]
	pxor	@x[3], @t[3]
	pxor	@t[0], @x[2]
	pxor	@t[1], @x[3]
	pshufd	\$0x4E, @x[5], @t[5]
	pxor	@x[4], @t[4]
	pxor	@t[7], @x[1]
	pxor	@t[2], @x[4]
	pxor	@x[5], @t[5]

	pxor	@t[7], @x[2]
	pxor	@t[6], @x[3]
	pxor	@t[6], @x[4]
	pxor	@t[3], @x[5]
	pxor	@t[4], @x[6]
	pxor	@t[7], @x[4]
	pxor	@t[7], @x[5]
	pxor	@t[5], @x[7]
___
	&MixColumns	(@x,@t,1);	# flipped 2<->3 and 4<->6
}
sub aesenc {				# not used
my @b=@_[0..7];
my @t=@_[8..15];
$code.=<<___;
	movdqa	0x30($const),@t[0]	# .LSR
___
	&ShiftRows	(@b,@t[0]);
	&Sbox		(@b,@t);
	&MixColumns	(@b[0,1,4,6,3,7,2,5],@t);
}

sub aesenclast {			# not used
my @b=@_[0..7];
my @t=@_[8..15];
$code.=<<___;
	movdqa	0x40($const),@t[0]	# .LSRM0
___
	&ShiftRows	(@b,@t[0]);
	&Sbox		(@b,@t);
$code.=<<___
	pxor	0x00($key),@b[0]
	pxor	0x10($key),@b[1]
	pxor	0x20($key),@b[4]
	pxor	0x30($key),@b[6]
	pxor	0x40($key),@b[3]
	pxor	0x50($key),@b[7]
	pxor	0x60($key),@b[2]
	pxor	0x70($key),@b[5]
___
}

sub swapmove {
my ($a,$b,$n,$mask,$t)=@_;
$code.=<<___;
	movdqa	$b,$t
	psrlq	\$$n,$b
	pxor	$a,$b
	pand	$mask,$b
	pxor	$b,$a
	psllq	\$$n,$b
	pxor	$t,$b
___
}
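# Scalar model of swapmove: it exchanges the bits of $a selected by
# $mask with the bits of $b at positions $mask<<$n (an illustrative
# sketch only; the generator itself never calls it):
sub swapmove_model {
my ($a,$b,$n,$mask)=@_;
	my $u = (($b >> $n) ^ $a) & $mask;	# differing bits, at mask positions
	return ($a ^ $u, $b ^ ($u << $n));	# swap them between $a and $b
}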
sub swapmove2x {
my ($a0,$b0,$a1,$b1,$n,$mask,$t0,$t1)=@_;
$code.=<<___;
	movdqa	$b0,$t0
	psrlq	\$$n,$b0
	movdqa	$b1,$t1
	psrlq	\$$n,$b1
	pxor	$a0,$b0
	pxor	$a1,$b1
	pand	$mask,$b0
	pand	$mask,$b1
	pxor	$b0,$a0
	psllq	\$$n,$b0
	pxor	$b1,$a1
	psllq	\$$n,$b1
	pxor	$t0,$b0
	pxor	$t1,$b1
___
}

sub bitslice {
my @x=reverse(@_[0..7]);
my ($t0,$t1,$t2,$t3)=@_[8..11];
$code.=<<___;
	movdqa	0x00($const),$t0	# .LBS0
	movdqa	0x10($const),$t1	# .LBS1
___
	&swapmove2x(@x[0,1,2,3],1,$t0,$t2,$t3);
	&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
$code.=<<___;
	movdqa	0x20($const),$t0	# .LBS2
___
	&swapmove2x(@x[0,2,1,3],2,$t1,$t2,$t3);
	&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);

	&swapmove2x(@x[0,4,1,5],4,$t0,$t2,$t3);
	&swapmove2x(@x[2,6,3,7],4,$t0,$t2,$t3);
}
$code.=<<___;
.text

.extern	asm_AES_encrypt
.extern	asm_AES_decrypt

.type	_bsaes_encrypt8,\@abi-omnipotent
.align	64
_bsaes_encrypt8:
	lea	.LBS0(%rip), $const	# constants table

	movdqa	($key), @XMM[9]		# round 0 key
	lea	0x10($key), $key
	movdqa	0x50($const), @XMM[8]	# .LM0SR
	pxor	@XMM[9], @XMM[0]	# xor with round0 key
	pxor	@XMM[9], @XMM[1]
	pshufb	@XMM[8], @XMM[0]
	pxor	@XMM[9], @XMM[2]
	pshufb	@XMM[8], @XMM[1]
	pxor	@XMM[9], @XMM[3]
	pshufb	@XMM[8], @XMM[2]
	pxor	@XMM[9], @XMM[4]
	pshufb	@XMM[8], @XMM[3]
	pxor	@XMM[9], @XMM[5]
	pshufb	@XMM[8], @XMM[4]
	pxor	@XMM[9], @XMM[6]
	pshufb	@XMM[8], @XMM[5]
	pxor	@XMM[9], @XMM[7]
	pshufb	@XMM[8], @XMM[6]
	pshufb	@XMM[8], @XMM[7]
_bsaes_encrypt8_bitslice:
___
	&bitslice	(@XMM[0..7, 8..11]);
$code.=<<___;
	dec	$rounds
	jmp	.Lenc_sbox
.align	16
.Lenc_loop:
___
	&ShiftRows	(@XMM[0..7, 8]);
$code.=".Lenc_sbox:\n";
	&Sbox		(@XMM[0..7, 8..15]);
$code.=<<___;
	dec	$rounds
	jl	.Lenc_done
___
	&MixColumns	(@XMM[0,1,4,6,3,7,2,5, 8..15]);
$code.=<<___;
	movdqa	0x30($const), @XMM[8]	# .LSR
	jnz	.Lenc_loop
	movdqa	0x40($const), @XMM[8]	# .LSRM0
	jmp	.Lenc_loop
.align	16
.Lenc_done:
___
	# output in lsb > [t0, t1, t4, t6, t3, t7, t2, t5] < msb
	&bitslice	(@XMM[0,1,4,6,3,7,2,5, 8..11]);
$code.=<<___;
	movdqa	($key), @XMM[8]		# last round key
	pxor	@XMM[8], @XMM[4]
	pxor	@XMM[8], @XMM[6]
	pxor	@XMM[8], @XMM[3]
	pxor	@XMM[8], @XMM[7]
	pxor	@XMM[8], @XMM[2]
	pxor	@XMM[8], @XMM[5]
	pxor	@XMM[8], @XMM[0]
	pxor	@XMM[8], @XMM[1]
	ret
.size	_bsaes_encrypt8,.-_bsaes_encrypt8

.type	_bsaes_decrypt8,\@abi-omnipotent
.align	64
_bsaes_decrypt8:
	lea	.LBS0(%rip), $const	# constants table

	movdqa	($key), @XMM[9]		# round 0 key
	lea	0x10($key), $key
	movdqa	-0x30($const), @XMM[8]	# .LM0ISR
	pxor	@XMM[9], @XMM[0]	# xor with round0 key
	pxor	@XMM[9], @XMM[1]
	pshufb	@XMM[8], @XMM[0]
	pxor	@XMM[9], @XMM[2]
	pshufb	@XMM[8], @XMM[1]
	pxor	@XMM[9], @XMM[3]
	pshufb	@XMM[8], @XMM[2]
	pxor	@XMM[9], @XMM[4]
	pshufb	@XMM[8], @XMM[3]
	pxor	@XMM[9], @XMM[5]
	pshufb	@XMM[8], @XMM[4]
	pxor	@XMM[9], @XMM[6]
	pshufb	@XMM[8], @XMM[5]
	pxor	@XMM[9], @XMM[7]
	pshufb	@XMM[8], @XMM[6]
	pshufb	@XMM[8], @XMM[7]
___
	&bitslice	(@XMM[0..7, 8..11]);
$code.=<<___;
	dec	$rounds
	jmp	.Ldec_sbox
.align	16
.Ldec_loop:
___
	&ShiftRows	(@XMM[0..7, 8]);
$code.=".Ldec_sbox:\n";
	&InvSbox	(@XMM[0..7, 8..15]);
$code.=<<___;
	dec	$rounds
	jl	.Ldec_done
___
	&InvMixColumns	(@XMM[0,1,6,4,2,7,3,5, 8..15]);
$code.=<<___;
	movdqa	-0x10($const), @XMM[8]	# .LISR
	jnz	.Ldec_loop
	movdqa	-0x20($const), @XMM[8]	# .LISRM0
	jmp	.Ldec_loop
.align	16
.Ldec_done:
___
	&bitslice	(@XMM[0,1,6,4,2,7,3,5, 8..11]);
$code.=<<___;
	movdqa	($key), @XMM[8]		# last round key
	pxor	@XMM[8], @XMM[6]
	pxor	@XMM[8], @XMM[4]
	pxor	@XMM[8], @XMM[2]
	pxor	@XMM[8], @XMM[7]
	pxor	@XMM[8], @XMM[3]
	pxor	@XMM[8], @XMM[5]
	pxor	@XMM[8], @XMM[0]
	pxor	@XMM[8], @XMM[1]
	ret
.size	_bsaes_decrypt8,.-_bsaes_decrypt8
___
}
{
my ($out,$inp,$rounds,$const)=("%rax","%rcx","%r10d","%r11");

sub bitslice_key {
my @x=reverse(@_[0..7]);
my ($bs0,$bs1,$bs2,$t2,$t3)=@_[8..12];

	&swapmove	(@x[0,1],1,$bs0,$t2,$t3);
$code.=<<___;
	#&swapmove(@x[2,3],1,$t0,$t2,$t3);
	movdqa	@x[0], @x[2]
	movdqa	@x[1], @x[3]

	#&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
___
	&swapmove2x	(@x[0,2,1,3],2,$bs1,$t2,$t3);
$code.=<<___;
	#&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
	movdqa	@x[0], @x[4]
	movdqa	@x[2], @x[6]
	movdqa	@x[1], @x[5]
	movdqa	@x[3], @x[7]
___
	&swapmove2x	(@x[0,4,1,5],4,$bs2,$t2,$t3);
	&swapmove2x	(@x[2,6,3,7],4,$bs2,$t2,$t3);
}
$code.=<<___;
.type	_bsaes_key_convert,\@abi-omnipotent
.align	16
_bsaes_key_convert:
	lea	.Lmasks(%rip), $const
	movdqu	($inp), %xmm7		# load round 0 key
	lea	0x10($inp), $inp
	movdqa	0x00($const), %xmm0	# 0x01...
	movdqa	0x10($const), %xmm1	# 0x02...
	movdqa	0x20($const), %xmm2	# 0x04...
	movdqa	0x30($const), %xmm3	# 0x08...
	movdqa	0x40($const), %xmm4	# .LM0
	pcmpeqd	%xmm5, %xmm5		# .LNOT

	movdqu	($inp), %xmm6		# load round 1 key
	movdqa	%xmm7, ($out)		# save round 0 key
	lea	0x10($out), $out
	dec	$rounds
	jmp	.Lkey_loop
.align	16
.Lkey_loop:
	pshufb	%xmm4, %xmm6		# .LM0

	movdqa	%xmm0, %xmm8
	movdqa	%xmm1, %xmm9

	pand	%xmm6, %xmm8
	pand	%xmm6, %xmm9
	movdqa	%xmm2, %xmm10
	pcmpeqb	%xmm0, %xmm8
	psllq	\$4, %xmm0		# 0x10...
	movdqa	%xmm3, %xmm11
	pcmpeqb	%xmm1, %xmm9
	psllq	\$4, %xmm1		# 0x20...

	pand	%xmm6, %xmm10
	pand	%xmm6, %xmm11
	movdqa	%xmm0, %xmm12
	pcmpeqb	%xmm2, %xmm10
	psllq	\$4, %xmm2		# 0x40...
	movdqa	%xmm1, %xmm13
	pcmpeqb	%xmm3, %xmm11
	psllq	\$4, %xmm3		# 0x80...

	movdqa	%xmm2, %xmm14
	movdqa	%xmm3, %xmm15
	pxor	%xmm5, %xmm8		# "pnot"
	pxor	%xmm5, %xmm9

	pand	%xmm6, %xmm12
	pand	%xmm6, %xmm13
	movdqa	%xmm8, 0x00($out)	# write bit-sliced round key
	pcmpeqb	%xmm0, %xmm12
	psrlq	\$4, %xmm0		# 0x01...
	movdqa	%xmm9, 0x10($out)
	pcmpeqb	%xmm1, %xmm13
	psrlq	\$4, %xmm1		# 0x02...
	lea	0x10($inp), $inp

	pand	%xmm6, %xmm14
	pand	%xmm6, %xmm15
	movdqa	%xmm10, 0x20($out)
	pcmpeqb	%xmm2, %xmm14
	psrlq	\$4, %xmm2		# 0x04...
	movdqa	%xmm11, 0x30($out)
	pcmpeqb	%xmm3, %xmm15
	psrlq	\$4, %xmm3		# 0x08...
	movdqu	($inp), %xmm6		# load next round key

	pxor	%xmm5, %xmm13		# "pnot"
	pxor	%xmm5, %xmm14
	movdqa	%xmm12, 0x40($out)
	movdqa	%xmm13, 0x50($out)
	movdqa	%xmm14, 0x60($out)
	movdqa	%xmm15, 0x70($out)
	lea	0x80($out),$out
	dec	$rounds
	jnz	.Lkey_loop

	movdqa	0x50($const), %xmm7	# .L63
	#movdqa	%xmm6, ($out)		# don't save last round key
	ret
.size	_bsaes_key_convert,.-_bsaes_key_convert
___
}
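# A note on the layout _bsaes_key_convert emits: the 16-byte round-0
# key is stored as-is (it is applied before the input is bit-sliced),
# while every following round key is expanded to eight 128-bit slice
# vectors, vector j holding 0xff in each byte lane whose key bit j is
# set. The slices stored complemented (the "pnot" above) are those for
# bits 0, 1, 5 and 6, which matches the set bits of the S-box affine
# constant 0x63 being folded into the schedule.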
if (0 && !$win64) {	# following four functions are unsupported interface
			# used for benchmarking...
$code.=<<___;
.globl	bsaes_enc_key_convert
.type	bsaes_enc_key_convert,\@function,2
.align	16
bsaes_enc_key_convert:
	mov	240($inp),%r10d		# pass rounds
	mov	$inp,%rcx		# pass key
	mov	$out,%rax		# pass key schedule
	call	_bsaes_key_convert
	pxor	%xmm6,%xmm7		# fix up last round key
	movdqa	%xmm7,(%rax)		# save last round key
	ret
.size	bsaes_enc_key_convert,.-bsaes_enc_key_convert

.globl	bsaes_encrypt_128
.type	bsaes_encrypt_128,\@function,4
.align	16
bsaes_encrypt_128:
.Lenc128_loop:
	movdqu	0x00($inp), @XMM[0]	# load input
	movdqu	0x10($inp), @XMM[1]
	movdqu	0x20($inp), @XMM[2]
	movdqu	0x30($inp), @XMM[3]
	movdqu	0x40($inp), @XMM[4]
	movdqu	0x50($inp), @XMM[5]
	movdqu	0x60($inp), @XMM[6]
	movdqu	0x70($inp), @XMM[7]
	mov	$key, %rax		# pass the $key
	lea	0x80($inp), $inp
	mov	\$10,%r10d

	call	_bsaes_encrypt8

	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	movdqu	@XMM[6], 0x30($out)
	movdqu	@XMM[3], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[2], 0x60($out)
	movdqu	@XMM[5], 0x70($out)
	lea	0x80($out), $out
	sub	\$0x80,$len
	ja	.Lenc128_loop
	ret
.size	bsaes_encrypt_128,.-bsaes_encrypt_128

.globl	bsaes_dec_key_convert
.type	bsaes_dec_key_convert,\@function,2
.align	16
bsaes_dec_key_convert:
	mov	240($inp),%r10d		# pass rounds
	mov	$inp,%rcx		# pass key
	mov	$out,%rax		# pass key schedule
	call	_bsaes_key_convert
	pxor	($out),%xmm7		# fix up round 0 key
	movdqa	%xmm6,(%rax)		# save last round key
	movdqa	%xmm7,($out)
	ret
.size	bsaes_dec_key_convert,.-bsaes_dec_key_convert

.globl	bsaes_decrypt_128
.type	bsaes_decrypt_128,\@function,4
.align	16
bsaes_decrypt_128:
.Ldec128_loop:
	movdqu	0x00($inp), @XMM[0]	# load input
	movdqu	0x10($inp), @XMM[1]
	movdqu	0x20($inp), @XMM[2]
	movdqu	0x30($inp), @XMM[3]
	movdqu	0x40($inp), @XMM[4]
	movdqu	0x50($inp), @XMM[5]
	movdqu	0x60($inp), @XMM[6]
	movdqu	0x70($inp), @XMM[7]
	mov	$key, %rax		# pass the $key
	lea	0x80($inp), $inp
	mov	\$10,%r10d

	call	_bsaes_decrypt8

	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[3], 0x60($out)
	movdqu	@XMM[5], 0x70($out)
	lea	0x80($out), $out
	sub	\$0x80,$len
	ja	.Ldec128_loop
	ret
.size	bsaes_decrypt_128,.-bsaes_decrypt_128
___
}
######################################################################
#
# OpenSSL interface
#
my ($arg1,$arg2,$arg3,$arg4,$arg5,$arg6)=$win64	? ("%rcx","%rdx","%r8","%r9","%r10","%r11d")
						: ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
my ($inp,$out,$len,$key)=("%r12","%r13","%r14","%r15");

if ($ecb) {
$code.=<<___;
.globl	bsaes_ecb_encrypt_blocks
.type	bsaes_ecb_encrypt_blocks,\@abi-omnipotent
.align	16
bsaes_ecb_encrypt_blocks:
	mov	%rsp, %rax
.Lecb_enc_prologue:
	push	%rbp
	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	lea	-0x48(%rsp),%rsp
___
$code.=<<___ if ($win64);
	lea	-0xa0(%rsp), %rsp
	movaps	%xmm6, 0x40(%rsp)
	movaps	%xmm7, 0x50(%rsp)
	movaps	%xmm8, 0x60(%rsp)
	movaps	%xmm9, 0x70(%rsp)
	movaps	%xmm10, 0x80(%rsp)
	movaps	%xmm11, 0x90(%rsp)
	movaps	%xmm12, 0xa0(%rsp)
	movaps	%xmm13, 0xb0(%rsp)
	movaps	%xmm14, 0xc0(%rsp)
	movaps	%xmm15, 0xd0(%rsp)
.Lecb_enc_body:
___
$code.=<<___;
	mov	%rsp,%rbp		# backup %rsp
	mov	240($arg4),%eax		# rounds
	mov	$arg1,$inp		# backup arguments
	mov	$arg2,$out
	mov	$arg3,$len
	mov	$arg4,$key
	cmp	\$8,$arg3
	jb	.Lecb_enc_short

	mov	%eax,%ebx		# backup rounds
	shl	\$7,%rax		# 128 bytes per inner round key
	sub	\$`128-32`,%rax		# size of bit-sliced key schedule
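	# (i.e. rounds*128-96 bytes in total, which matches the layout
	# _bsaes_key_convert emits: a 16-byte round-0 key, rounds-1
	# bit-sliced round keys of 128 bytes each, plus a 16-byte last
	# round key saved by the caller)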
	sub	%rax,%rsp
	mov	%rsp,%rax		# pass key schedule
	mov	$key,%rcx		# pass key
	mov	%ebx,%r10d		# pass rounds
	call	_bsaes_key_convert
	pxor	%xmm6,%xmm7		# fix up last round key
	movdqa	%xmm7,(%rax)		# save last round key

	sub	\$8,$len
.Lecb_enc_loop:
	movdqu	0x00($inp), @XMM[0]	# load input
	movdqu	0x10($inp), @XMM[1]
	movdqu	0x20($inp), @XMM[2]
	movdqu	0x30($inp), @XMM[3]
	movdqu	0x40($inp), @XMM[4]
	movdqu	0x50($inp), @XMM[5]
	mov	%rsp, %rax		# pass key schedule
	movdqu	0x60($inp), @XMM[6]
	mov	%ebx,%r10d		# pass rounds
	movdqu	0x70($inp), @XMM[7]
	lea	0x80($inp), $inp

	call	_bsaes_encrypt8

	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	movdqu	@XMM[6], 0x30($out)
	movdqu	@XMM[3], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[2], 0x60($out)
	movdqu	@XMM[5], 0x70($out)
	lea	0x80($out), $out
	sub	\$8,$len
	jnc	.Lecb_enc_loop

	add	\$8,$len
	jz	.Lecb_enc_done

	movdqu	0x00($inp), @XMM[0]	# load input
	mov	%rsp, %rax		# pass key schedule
	mov	%ebx,%r10d		# pass rounds
	cmp	\$2,$len
	jb	.Lecb_enc_one
	movdqu	0x10($inp), @XMM[1]
	je	.Lecb_enc_two
	movdqu	0x20($inp), @XMM[2]
	cmp	\$4,$len
	jb	.Lecb_enc_three
	movdqu	0x30($inp), @XMM[3]
	je	.Lecb_enc_four
	movdqu	0x40($inp), @XMM[4]
	cmp	\$6,$len
	jb	.Lecb_enc_five
	movdqu	0x50($inp), @XMM[5]
	je	.Lecb_enc_six
	movdqu	0x60($inp), @XMM[6]
	call	_bsaes_encrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	movdqu	@XMM[6], 0x30($out)
	movdqu	@XMM[3], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[2], 0x60($out)
	jmp	.Lecb_enc_done
.align	16
.Lecb_enc_six:
	call	_bsaes_encrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	movdqu	@XMM[6], 0x30($out)
	movdqu	@XMM[3], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	jmp	.Lecb_enc_done
.align	16
.Lecb_enc_five:
	call	_bsaes_encrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	movdqu	@XMM[6], 0x30($out)
	movdqu	@XMM[3], 0x40($out)
	jmp	.Lecb_enc_done
.align	16
.Lecb_enc_four:
	call	_bsaes_encrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	movdqu	@XMM[6], 0x30($out)
	jmp	.Lecb_enc_done
.align	16
.Lecb_enc_three:
	call	_bsaes_encrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	jmp	.Lecb_enc_done
.align	16
.Lecb_enc_two:
	call	_bsaes_encrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	jmp	.Lecb_enc_done
.align	16
.Lecb_enc_one:
	call	_bsaes_encrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	jmp	.Lecb_enc_done
.align	16
.Lecb_enc_short:
	lea	($inp), $arg1
	lea	($out), $arg2
	lea	($key), $arg3
	call	asm_AES_encrypt
	lea	16($inp), $inp
	lea	16($out), $out
	dec	$len
	jnz	.Lecb_enc_short

.Lecb_enc_done:
	lea	(%rsp),%rax
	pxor	%xmm0, %xmm0
.Lecb_enc_bzero:			# wipe key schedule [if any]
	movdqa	%xmm0, 0x00(%rax)
	movdqa	%xmm0, 0x10(%rax)
	lea	0x20(%rax), %rax
	cmp	%rax, %rbp
	jb	.Lecb_enc_bzero

	lea	(%rbp),%rsp		# restore %rsp
___
$code.=<<___ if ($win64);
	movaps	0x40(%rbp), %xmm6
	movaps	0x50(%rbp), %xmm7
	movaps	0x60(%rbp), %xmm8
	movaps	0x70(%rbp), %xmm9
	movaps	0x80(%rbp), %xmm10
	movaps	0x90(%rbp), %xmm11
	movaps	0xa0(%rbp), %xmm12
	movaps	0xb0(%rbp), %xmm13
	movaps	0xc0(%rbp), %xmm14
	movaps	0xd0(%rbp), %xmm15
	lea	0xa0(%rbp), %rsp
___
$code.=<<___;
	mov	0x48(%rsp), %r15
	mov	0x50(%rsp), %r14
	mov	0x58(%rsp), %r13
	mov	0x60(%rsp), %r12
	mov	0x68(%rsp), %rbx
	mov	0x70(%rsp), %rax
	lea	0x78(%rsp), %rsp
	mov	%rax, %rbp
.Lecb_enc_epilogue:
	ret
.size	bsaes_ecb_encrypt_blocks,.-bsaes_ecb_encrypt_blocks
.globl	bsaes_ecb_decrypt_blocks
.type	bsaes_ecb_decrypt_blocks,\@abi-omnipotent
.align	16
bsaes_ecb_decrypt_blocks:
	mov	%rsp, %rax
.Lecb_dec_prologue:
	push	%rbp
	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	lea	-0x48(%rsp),%rsp
___
$code.=<<___ if ($win64);
	lea	-0xa0(%rsp), %rsp
	movaps	%xmm6, 0x40(%rsp)
	movaps	%xmm7, 0x50(%rsp)
	movaps	%xmm8, 0x60(%rsp)
	movaps	%xmm9, 0x70(%rsp)
	movaps	%xmm10, 0x80(%rsp)
	movaps	%xmm11, 0x90(%rsp)
	movaps	%xmm12, 0xa0(%rsp)
	movaps	%xmm13, 0xb0(%rsp)
	movaps	%xmm14, 0xc0(%rsp)
	movaps	%xmm15, 0xd0(%rsp)
.Lecb_dec_body:
___
$code.=<<___;
	mov	%rsp,%rbp		# backup %rsp
	mov	240($arg4),%eax		# rounds
	mov	$arg1,$inp		# backup arguments
	mov	$arg2,$out
	mov	$arg3,$len
	mov	$arg4,$key
	cmp	\$8,$arg3
	jb	.Lecb_dec_short

	mov	%eax,%ebx		# backup rounds
	shl	\$7,%rax		# 128 bytes per inner round key
	sub	\$`128-32`,%rax		# size of bit-sliced key schedule
	sub	%rax,%rsp
	mov	%rsp,%rax		# pass key schedule
	mov	$key,%rcx		# pass key
	mov	%ebx,%r10d		# pass rounds
	call	_bsaes_key_convert
	pxor	(%rsp),%xmm7		# fix up round 0 key
	movdqa	%xmm6,(%rax)		# save last round key
	movdqa	%xmm7,(%rsp)

	sub	\$8,$len
.Lecb_dec_loop:
	movdqu	0x00($inp), @XMM[0]	# load input
	movdqu	0x10($inp), @XMM[1]
	movdqu	0x20($inp), @XMM[2]
	movdqu	0x30($inp), @XMM[3]
	movdqu	0x40($inp), @XMM[4]
	movdqu	0x50($inp), @XMM[5]
	mov	%rsp, %rax		# pass key schedule
	movdqu	0x60($inp), @XMM[6]
	mov	%ebx,%r10d		# pass rounds
	movdqu	0x70($inp), @XMM[7]
	lea	0x80($inp), $inp

	call	_bsaes_decrypt8

	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[3], 0x60($out)
	movdqu	@XMM[5], 0x70($out)
	lea	0x80($out), $out
	sub	\$8,$len
	jnc	.Lecb_dec_loop

	add	\$8,$len
	jz	.Lecb_dec_done

	movdqu	0x00($inp), @XMM[0]	# load input
	mov	%rsp, %rax		# pass key schedule
	mov	%ebx,%r10d		# pass rounds
	cmp	\$2,$len
	jb	.Lecb_dec_one
	movdqu	0x10($inp), @XMM[1]
	je	.Lecb_dec_two
	movdqu	0x20($inp), @XMM[2]
	cmp	\$4,$len
	jb	.Lecb_dec_three
	movdqu	0x30($inp), @XMM[3]
	je	.Lecb_dec_four
	movdqu	0x40($inp), @XMM[4]
	cmp	\$6,$len
	jb	.Lecb_dec_five
	movdqu	0x50($inp), @XMM[5]
	je	.Lecb_dec_six
	movdqu	0x60($inp), @XMM[6]
	call	_bsaes_decrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[3], 0x60($out)
	jmp	.Lecb_dec_done
.align	16
.Lecb_dec_six:
	call	_bsaes_decrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	jmp	.Lecb_dec_done
.align	16
.Lecb_dec_five:
	call	_bsaes_decrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	jmp	.Lecb_dec_done
.align	16
.Lecb_dec_four:
	call	_bsaes_decrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	jmp	.Lecb_dec_done
.align	16
.Lecb_dec_three:
	call	_bsaes_decrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	jmp	.Lecb_dec_done
.align	16
.Lecb_dec_two:
	call	_bsaes_decrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	jmp	.Lecb_dec_done
.align	16
.Lecb_dec_one:
	call	_bsaes_decrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	jmp	.Lecb_dec_done
.align	16
.Lecb_dec_short:
	lea	($inp), $arg1
	lea	($out), $arg2
	lea	($key), $arg3
	call	asm_AES_decrypt
	lea	16($inp), $inp
	lea	16($out), $out
	dec	$len
	jnz	.Lecb_dec_short

.Lecb_dec_done:
	lea	(%rsp),%rax
	pxor	%xmm0, %xmm0
.Lecb_dec_bzero:			# wipe key schedule [if any]
	movdqa	%xmm0, 0x00(%rax)
	movdqa	%xmm0, 0x10(%rax)
	lea	0x20(%rax), %rax
	cmp	%rax, %rbp
	jb	.Lecb_dec_bzero

	lea	(%rbp),%rsp		# restore %rsp
___
$code.=<<___ if ($win64);
	movaps	0x40(%rbp), %xmm6
	movaps	0x50(%rbp), %xmm7
	movaps	0x60(%rbp), %xmm8
	movaps	0x70(%rbp), %xmm9
	movaps	0x80(%rbp), %xmm10
	movaps	0x90(%rbp), %xmm11
	movaps	0xa0(%rbp), %xmm12
	movaps	0xb0(%rbp), %xmm13
	movaps	0xc0(%rbp), %xmm14
	movaps	0xd0(%rbp), %xmm15
	lea	0xa0(%rbp), %rsp
___
$code.=<<___;
	mov	0x48(%rsp), %r15
	mov	0x50(%rsp), %r14
	mov	0x58(%rsp), %r13
	mov	0x60(%rsp), %r12
	mov	0x68(%rsp), %rbx
	mov	0x70(%rsp), %rax
	lea	0x78(%rsp), %rsp
	mov	%rax, %rbp
.Lecb_dec_epilogue:
	ret
.size	bsaes_ecb_decrypt_blocks,.-bsaes_ecb_decrypt_blocks
___
}
$code.=<<___;
.extern	asm_AES_cbc_encrypt
.globl	bsaes_cbc_encrypt
.type	bsaes_cbc_encrypt,\@abi-omnipotent
.align	16
bsaes_cbc_encrypt:
___
$code.=<<___ if ($win64);
	mov	48(%rsp),$arg6		# pull direction flag
___
$code.=<<___;
	cmp	\$0,$arg6
	jne	asm_AES_cbc_encrypt
	cmp	\$128,$arg3
	jb	asm_AES_cbc_encrypt

	mov	%rsp, %rax
.Lcbc_dec_prologue:
	push	%rbp
	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	lea	-0x48(%rsp), %rsp
___
$code.=<<___ if ($win64);
	mov	0xa0(%rsp),$arg5	# pull ivp
	lea	-0xa0(%rsp), %rsp
	movaps	%xmm6, 0x40(%rsp)
	movaps	%xmm7, 0x50(%rsp)
	movaps	%xmm8, 0x60(%rsp)
	movaps	%xmm9, 0x70(%rsp)
	movaps	%xmm10, 0x80(%rsp)
	movaps	%xmm11, 0x90(%rsp)
	movaps	%xmm12, 0xa0(%rsp)
	movaps	%xmm13, 0xb0(%rsp)
	movaps	%xmm14, 0xc0(%rsp)
	movaps	%xmm15, 0xd0(%rsp)
.Lcbc_dec_body:
___
$code.=<<___;
	mov	%rsp, %rbp		# backup %rsp
	mov	240($arg4), %eax	# rounds
	mov	$arg1, $inp		# backup arguments
	mov	$arg2, $out
	mov	$arg3, $len
	mov	$arg4, $key
	mov	$arg5, %rbx
	shr	\$4, $len		# bytes to blocks

	mov	%eax, %edx		# rounds
	shl	\$7, %rax		# 128 bytes per inner round key
	sub	\$`128-32`, %rax	# size of bit-sliced key schedule
	sub	%rax, %rsp

	mov	%rsp, %rax		# pass key schedule
	mov	$key, %rcx		# pass key
	mov	%edx, %r10d		# pass rounds
	call	_bsaes_key_convert
	pxor	(%rsp),%xmm7		# fix up round 0 key
	movdqa	%xmm6,(%rax)		# save last round key
	movdqa	%xmm7,(%rsp)

	movdqu	(%rbx), @XMM[15]	# load IV
	sub	\$8,$len
.Lcbc_dec_loop:
	movdqu	0x00($inp), @XMM[0]	# load input
	movdqu	0x10($inp), @XMM[1]
	movdqu	0x20($inp), @XMM[2]
	movdqu	0x30($inp), @XMM[3]
	movdqu	0x40($inp), @XMM[4]
	movdqu	0x50($inp), @XMM[5]
	mov	%rsp, %rax		# pass key schedule
	movdqu	0x60($inp), @XMM[6]
	mov	%edx,%r10d		# pass rounds
	movdqu	0x70($inp), @XMM[7]
	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV

	call	_bsaes_decrypt8

	pxor	0x20(%rbp), @XMM[0]	# ^= IV
	movdqu	0x00($inp), @XMM[8]	# re-load input
	movdqu	0x10($inp), @XMM[9]
	pxor	@XMM[8], @XMM[1]
	movdqu	0x20($inp), @XMM[10]
	pxor	@XMM[9], @XMM[6]
	movdqu	0x30($inp), @XMM[11]
	pxor	@XMM[10], @XMM[4]
	movdqu	0x40($inp), @XMM[12]
	pxor	@XMM[11], @XMM[2]
	movdqu	0x50($inp), @XMM[13]
	pxor	@XMM[12], @XMM[7]
	movdqu	0x60($inp), @XMM[14]
	pxor	@XMM[13], @XMM[3]
	movdqu	0x70($inp), @XMM[15]	# IV
	pxor	@XMM[14], @XMM[5]
	movdqu	@XMM[0], 0x00($out)	# write output
	lea	0x80($inp), $inp
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[3], 0x60($out)
	movdqu	@XMM[5], 0x70($out)
	lea	0x80($out), $out
	sub	\$8,$len
	jnc	.Lcbc_dec_loop

	add	\$8,$len
	jz	.Lcbc_dec_done

	movdqu	0x00($inp), @XMM[0]	# load input
	mov	%rsp, %rax		# pass key schedule
	mov	%edx, %r10d		# pass rounds
	cmp	\$2,$len
	jb	.Lcbc_dec_one
	movdqu	0x10($inp), @XMM[1]
	je	.Lcbc_dec_two
	movdqu	0x20($inp), @XMM[2]
	cmp	\$4,$len
	jb	.Lcbc_dec_three
	movdqu	0x30($inp), @XMM[3]
	je	.Lcbc_dec_four
	movdqu	0x40($inp), @XMM[4]
	cmp	\$6,$len
	jb	.Lcbc_dec_five
	movdqu	0x50($inp), @XMM[5]
	je	.Lcbc_dec_six
	movdqu	0x60($inp), @XMM[6]
	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
	call	_bsaes_decrypt8
	pxor	0x20(%rbp), @XMM[0]	# ^= IV
	movdqu	0x00($inp), @XMM[8]	# re-load input
	movdqu	0x10($inp), @XMM[9]
	pxor	@XMM[8], @XMM[1]
	movdqu	0x20($inp), @XMM[10]
	pxor	@XMM[9], @XMM[6]
	movdqu	0x30($inp), @XMM[11]
	pxor	@XMM[10], @XMM[4]
	movdqu	0x40($inp), @XMM[12]
	pxor	@XMM[11], @XMM[2]
	movdqu	0x50($inp), @XMM[13]
	pxor	@XMM[12], @XMM[7]
	movdqu	0x60($inp), @XMM[15]	# IV
	pxor	@XMM[13], @XMM[3]
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[3], 0x60($out)
	jmp	.Lcbc_dec_done
.align	16
.Lcbc_dec_six:
	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
	call	_bsaes_decrypt8
	pxor	0x20(%rbp), @XMM[0]	# ^= IV
	movdqu	0x00($inp), @XMM[8]	# re-load input
	movdqu	0x10($inp), @XMM[9]
	pxor	@XMM[8], @XMM[1]
	movdqu	0x20($inp), @XMM[10]
	pxor	@XMM[9], @XMM[6]
	movdqu	0x30($inp), @XMM[11]
	pxor	@XMM[10], @XMM[4]
	movdqu	0x40($inp), @XMM[12]
	pxor	@XMM[11], @XMM[2]
	movdqu	0x50($inp), @XMM[15]	# IV
	pxor	@XMM[12], @XMM[7]
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	jmp	.Lcbc_dec_done
.align	16
.Lcbc_dec_five:
	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
	call	_bsaes_decrypt8
	pxor	0x20(%rbp), @XMM[0]	# ^= IV
	movdqu	0x00($inp), @XMM[8]	# re-load input
	movdqu	0x10($inp), @XMM[9]
	pxor	@XMM[8], @XMM[1]
	movdqu	0x20($inp), @XMM[10]
	pxor	@XMM[9], @XMM[6]
	movdqu	0x30($inp), @XMM[11]
	pxor	@XMM[10], @XMM[4]
	movdqu	0x40($inp), @XMM[15]	# IV
	pxor	@XMM[11], @XMM[2]
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	jmp	.Lcbc_dec_done
.align	16
.Lcbc_dec_four:
	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
	call	_bsaes_decrypt8
	pxor	0x20(%rbp), @XMM[0]	# ^= IV
	movdqu	0x00($inp), @XMM[8]	# re-load input
	movdqu	0x10($inp), @XMM[9]
	pxor	@XMM[8], @XMM[1]
	movdqu	0x20($inp), @XMM[10]
	pxor	@XMM[9], @XMM[6]
	movdqu	0x30($inp), @XMM[15]	# IV
	pxor	@XMM[10], @XMM[4]
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	jmp	.Lcbc_dec_done
.align	16
.Lcbc_dec_three:
	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
	call	_bsaes_decrypt8
	pxor	0x20(%rbp), @XMM[0]	# ^= IV
	movdqu	0x00($inp), @XMM[8]	# re-load input
	movdqu	0x10($inp), @XMM[9]
	pxor	@XMM[8], @XMM[1]
	movdqu	0x20($inp), @XMM[15]	# IV
	pxor	@XMM[9], @XMM[6]
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	jmp	.Lcbc_dec_done
.align	16
.Lcbc_dec_two:
	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
	call	_bsaes_decrypt8
	pxor	0x20(%rbp), @XMM[0]	# ^= IV
	movdqu	0x00($inp), @XMM[8]	# re-load input
	movdqu	0x10($inp), @XMM[15]	# IV
	pxor	@XMM[8], @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	jmp	.Lcbc_dec_done
.align	16
.Lcbc_dec_one:
	lea	($inp), $arg1
	lea	0x20(%rbp), $arg2	# buffer output
	lea	($key), $arg3
	call	asm_AES_decrypt		# doesn't touch %xmm
	pxor	0x20(%rbp), @XMM[15]	# ^= IV
	movdqu	@XMM[15], ($out)	# write output
	movdqa	@XMM[0], @XMM[15]	# IV

.Lcbc_dec_done:
	movdqu	@XMM[15], (%rbx)	# return IV
	lea	(%rsp), %rax
	pxor	%xmm0, %xmm0
.Lcbc_dec_bzero:			# wipe key schedule [if any]
	movdqa	%xmm0, 0x00(%rax)
	movdqa	%xmm0, 0x10(%rax)
	lea	0x20(%rax), %rax
	cmp	%rax, %rbp
	ja	.Lcbc_dec_bzero

	lea	(%rbp),%rsp		# restore %rsp
___
$code.=<<___ if ($win64);
	movaps	0x40(%rbp), %xmm6
	movaps	0x50(%rbp), %xmm7
	movaps	0x60(%rbp), %xmm8
	movaps	0x70(%rbp), %xmm9
	movaps	0x80(%rbp), %xmm10
	movaps	0x90(%rbp), %xmm11
	movaps	0xa0(%rbp), %xmm12
	movaps	0xb0(%rbp), %xmm13
	movaps	0xc0(%rbp), %xmm14
	movaps	0xd0(%rbp), %xmm15
	lea	0xa0(%rbp), %rsp
___
$code.=<<___;
	mov	0x48(%rsp), %r15
	mov	0x50(%rsp), %r14
	mov	0x58(%rsp), %r13
	mov	0x60(%rsp), %r12
	mov	0x68(%rsp), %rbx
	mov	0x70(%rsp), %rax
	lea	0x78(%rsp), %rsp
	mov	%rax, %rbp
.Lcbc_dec_epilogue:
	ret
.size	bsaes_cbc_encrypt,.-bsaes_cbc_encrypt
.globl	bsaes_ctr32_encrypt_blocks
.type	bsaes_ctr32_encrypt_blocks,\@abi-omnipotent
.align	16
bsaes_ctr32_encrypt_blocks:
	mov	%rsp, %rax
.Lctr_enc_prologue:
	push	%rbp
	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	lea	-0x48(%rsp), %rsp
___
$code.=<<___ if ($win64);
	mov	0xa0(%rsp),$arg5	# pull ivp
	lea	-0xa0(%rsp), %rsp
	movaps	%xmm6, 0x40(%rsp)
	movaps	%xmm7, 0x50(%rsp)
	movaps	%xmm8, 0x60(%rsp)
	movaps	%xmm9, 0x70(%rsp)
	movaps	%xmm10, 0x80(%rsp)
	movaps	%xmm11, 0x90(%rsp)
	movaps	%xmm12, 0xa0(%rsp)
	movaps	%xmm13, 0xb0(%rsp)
	movaps	%xmm14, 0xc0(%rsp)
	movaps	%xmm15, 0xd0(%rsp)
.Lctr_enc_body:
___
$code.=<<___;
	mov	%rsp, %rbp		# backup %rsp
	movdqu	($arg5), %xmm0		# load counter
	mov	240($arg4), %eax	# rounds
	mov	$arg1, $inp		# backup arguments
	mov	$arg2, $out
	mov	$arg3, $len
	mov	$arg4, $key
	movdqa	%xmm0, 0x20(%rbp)	# copy counter
	cmp	\$8, $arg3
	jb	.Lctr_enc_short

	mov	%eax, %ebx		# rounds
	shl	\$7, %rax		# 128 bytes per inner round key
	sub	\$`128-32`, %rax	# size of bit-sliced key schedule
	sub	%rax, %rsp

	mov	%rsp, %rax		# pass key schedule
	mov	$key, %rcx		# pass key
	mov	%ebx, %r10d		# pass rounds
	call	_bsaes_key_convert
	pxor	%xmm6,%xmm7		# fix up last round key
	movdqa	%xmm7,(%rax)		# save last round key

	movdqa	(%rsp), @XMM[9]		# load round0 key
	lea	.LADD1(%rip), %r11
	movdqa	0x20(%rbp), @XMM[0]	# counter copy
	movdqa	-0x20(%r11), @XMM[8]	# .LSWPUP
	pshufb	@XMM[8], @XMM[9]	# byte swap upper part
	pshufb	@XMM[8], @XMM[0]
	movdqa	@XMM[9], (%rsp)		# save adjusted round0 key
	jmp	.Lctr_enc_loop
.align	16
.Lctr_enc_loop:
	movdqa	@XMM[0], 0x20(%rbp)	# save counter
	movdqa	@XMM[0], @XMM[1]	# prepare 8 counter values
	movdqa	@XMM[0], @XMM[2]
	paddd	0x00(%r11), @XMM[1]	# .LADD1
	movdqa	@XMM[0], @XMM[3]
	paddd	0x10(%r11), @XMM[2]	# .LADD2
	movdqa	@XMM[0], @XMM[4]
	paddd	0x20(%r11), @XMM[3]	# .LADD3
	movdqa	@XMM[0], @XMM[5]
	paddd	0x30(%r11), @XMM[4]	# .LADD4
	movdqa	@XMM[0], @XMM[6]
	paddd	0x40(%r11), @XMM[5]	# .LADD5
	movdqa	@XMM[0], @XMM[7]
	paddd	0x50(%r11), @XMM[6]	# .LADD6
	paddd	0x60(%r11), @XMM[7]	# .LADD7
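	# (at this point @XMM[0..7] hold counter+0 through counter+7;
	# the .LSWPUP shuffle above byte-swapped the counter so that its
	# big-endian 32-bit counter word can be advanced with plain paddd)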
	# Borrow prologue from _bsaes_encrypt8 to use the opportunity
	# to flip byte order in 32-bit counter
	movdqa	(%rsp), @XMM[9]		# round 0 key
	lea	0x10(%rsp), %rax	# pass key schedule
	movdqa	-0x10(%r11), @XMM[8]	# .LSWPUPM0SR
	pxor	@XMM[9], @XMM[0]	# xor with round0 key
	pxor	@XMM[9], @XMM[1]
	pshufb	@XMM[8], @XMM[0]
	pxor	@XMM[9], @XMM[2]
	pshufb	@XMM[8], @XMM[1]
	pxor	@XMM[9], @XMM[3]
	pshufb	@XMM[8], @XMM[2]
	pxor	@XMM[9], @XMM[4]
	pshufb	@XMM[8], @XMM[3]
	pxor	@XMM[9], @XMM[5]
	pshufb	@XMM[8], @XMM[4]
	pxor	@XMM[9], @XMM[6]
	pshufb	@XMM[8], @XMM[5]
	pxor	@XMM[9], @XMM[7]
	pshufb	@XMM[8], @XMM[6]
	lea	.LBS0(%rip), %r11	# constants table
	pshufb	@XMM[8], @XMM[7]
	mov	%ebx,%r10d		# pass rounds

	call	_bsaes_encrypt8_bitslice

	sub	\$8,$len
	jc	.Lctr_enc_loop_done

	movdqu	0x00($inp), @XMM[8]	# load input
	movdqu	0x10($inp), @XMM[9]
	movdqu	0x20($inp), @XMM[10]
	movdqu	0x30($inp), @XMM[11]
	movdqu	0x40($inp), @XMM[12]
	movdqu	0x50($inp), @XMM[13]
	movdqu	0x60($inp), @XMM[14]
	movdqu	0x70($inp), @XMM[15]
	lea	0x80($inp),$inp
	pxor	@XMM[0], @XMM[8]
	movdqa	0x20(%rbp), @XMM[0]	# load counter
	pxor	@XMM[9], @XMM[1]
	movdqu	@XMM[8], 0x00($out)	# write output
	pxor	@XMM[10], @XMM[4]
	movdqu	@XMM[1], 0x10($out)
	pxor	@XMM[11], @XMM[6]
	movdqu	@XMM[4], 0x20($out)
	pxor	@XMM[12], @XMM[3]
	movdqu	@XMM[6], 0x30($out)
	pxor	@XMM[13], @XMM[7]
	movdqu	@XMM[3], 0x40($out)
	pxor	@XMM[14], @XMM[2]
	movdqu	@XMM[7], 0x50($out)
	pxor	@XMM[15], @XMM[5]
	movdqu	@XMM[2], 0x60($out)
	lea	.LADD1(%rip), %r11
	movdqu	@XMM[5], 0x70($out)
	lea	0x80($out), $out
	paddd	0x70(%r11), @XMM[0]	# .LADD8
	jnz	.Lctr_enc_loop

	jmp	.Lctr_enc_done
.align	16
.Lctr_enc_loop_done:
	add	\$8, $len
	movdqu	0x00($inp), @XMM[8]	# load input
	pxor	@XMM[8], @XMM[0]
	movdqu	@XMM[0], 0x00($out)	# write output
	cmp	\$2,$len
	jb	.Lctr_enc_done
	movdqu	0x10($inp), @XMM[9]
	pxor	@XMM[9], @XMM[1]
	movdqu	@XMM[1], 0x10($out)
	je	.Lctr_enc_done
	movdqu	0x20($inp), @XMM[10]
	pxor	@XMM[10], @XMM[4]
	movdqu	@XMM[4], 0x20($out)
	cmp	\$4,$len
	jb	.Lctr_enc_done
	movdqu	0x30($inp), @XMM[11]
	pxor	@XMM[11], @XMM[6]
	movdqu	@XMM[6], 0x30($out)
	je	.Lctr_enc_done
	movdqu	0x40($inp), @XMM[12]
	pxor	@XMM[12], @XMM[3]
	movdqu	@XMM[3], 0x40($out)
	cmp	\$6,$len
	jb	.Lctr_enc_done
	movdqu	0x50($inp), @XMM[13]
	pxor	@XMM[13], @XMM[7]
	movdqu	@XMM[7], 0x50($out)
	je	.Lctr_enc_done
	movdqu	0x60($inp), @XMM[14]
	pxor	@XMM[14], @XMM[2]
	movdqu	@XMM[2], 0x60($out)
	jmp	.Lctr_enc_done

.align	16
.Lctr_enc_short:
	lea	0x20(%rbp), $arg1
	lea	0x30(%rbp), $arg2
	lea	($key), $arg3
	call	asm_AES_encrypt
	movdqu	($inp), @XMM[1]
	lea	16($inp), $inp
	mov	0x2c(%rbp), %eax	# load 32-bit counter
	bswap	%eax
	pxor	0x30(%rbp), @XMM[1]
	inc	%eax			# increment
	movdqu	@XMM[1], ($out)
	bswap	%eax
	lea	16($out), $out
	mov	%eax, 0x2c(%rsp)	# save 32-bit counter
	dec	$len
	jnz	.Lctr_enc_short

.Lctr_enc_done:
	lea	(%rsp), %rax
	pxor	%xmm0, %xmm0
.Lctr_enc_bzero:			# wipe key schedule [if any]
	movdqa	%xmm0, 0x00(%rax)
	movdqa	%xmm0, 0x10(%rax)
	lea	0x20(%rax), %rax
	cmp	%rax, %rbp
	ja	.Lctr_enc_bzero

	lea	(%rbp),%rsp		# restore %rsp
___
$code.=<<___ if ($win64);
	movaps	0x40(%rbp), %xmm6
	movaps	0x50(%rbp), %xmm7
	movaps	0x60(%rbp), %xmm8
	movaps	0x70(%rbp), %xmm9
	movaps	0x80(%rbp), %xmm10
	movaps	0x90(%rbp), %xmm11
	movaps	0xa0(%rbp), %xmm12
	movaps	0xb0(%rbp), %xmm13
	movaps	0xc0(%rbp), %xmm14
	movaps	0xd0(%rbp), %xmm15
	lea	0xa0(%rbp), %rsp
___
$code.=<<___;
	mov	0x48(%rsp), %r15
	mov	0x50(%rsp), %r14
	mov	0x58(%rsp), %r13
	mov	0x60(%rsp), %r12
	mov	0x68(%rsp), %rbx
	mov	0x70(%rsp), %rax
	lea	0x78(%rsp), %rsp
	mov	%rax, %rbp
.Lctr_enc_epilogue:
	ret
.size	bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
___
######################################################################
# void bsaes_xts_[en|de]crypt(const char *inp,char *out,size_t len,
#	const AES_KEY *key1, const AES_KEY *key2,
#	const unsigned char iv[16]);
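#
# The per-block tweak update below multiplies the 128-bit tweak by x
# in GF(2^128) (reduction polynomial 0x87). As a scalar sketch of what
# the SSE sequence computes (an illustration only; this model sub is
# never called by the generator, and assumes bytes[0] is the least
# significant byte):
sub xts_double_model {
my @T=@_;					# 16 tweak bytes
	my $carry = ($T[15] & 0x80) ? 0x87 : 0;	# carry out of bit 127
	for (my $i=15; $i>0; $i--) {
		$T[$i] = (($T[$i]<<1) | ($T[$i-1]>>7)) & 0xff;
	}
	$T[0] = (($T[0]<<1) & 0xff) ^ $carry;	# fold the carry back in
	return @T;
}
# The vector code gets the same effect from paddq (shift both 64-bit
# halves left by one), pcmpgtd/pshufd (broadcast and reposition the
# carry bits) and pand with .Lxts_magic (select the 0x87 reduction and
# the inter-half carry), followed by pxor.
#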
2091 my ($twmask,$twres,$twtmp)=@XMM[13..15];
2092 $arg6=~s/d$//;
2094 $code.=<<___;
2095 .globl bsaes_xts_encrypt
2096 .type bsaes_xts_encrypt,\@abi-omnipotent
2097 .align 16
2098 bsaes_xts_encrypt:
2099 mov %rsp, %rax
2100 .Lxts_enc_prologue:
2101 push %rbp
2102 push %rbx
2103 push %r12
2104 push %r13
2105 push %r14
2106 push %r15
2107 lea -0x48(%rsp), %rsp
2109 $code.=<<___ if ($win64);
2110 mov 0xa0(%rsp),$arg5 # pull key2
2111 mov 0xa8(%rsp),$arg6 # pull ivp
2112 lea -0xa0(%rsp), %rsp
2113 movaps %xmm6, 0x40(%rsp)
2114 movaps %xmm7, 0x50(%rsp)
2115 movaps %xmm8, 0x60(%rsp)
2116 movaps %xmm9, 0x70(%rsp)
2117 movaps %xmm10, 0x80(%rsp)
2118 movaps %xmm11, 0x90(%rsp)
2119 movaps %xmm12, 0xa0(%rsp)
2120 movaps %xmm13, 0xb0(%rsp)
2121 movaps %xmm14, 0xc0(%rsp)
2122 movaps %xmm15, 0xd0(%rsp)
2123 .Lxts_enc_body:
2125 $code.=<<___;
2126 mov %rsp, %rbp # backup %rsp
2127 mov $arg1, $inp # backup arguments
2128 mov $arg2, $out
2129 mov $arg3, $len
2130 mov $arg4, $key
2132 lea ($arg6), $arg1
2133 lea 0x20(%rbp), $arg2
2134 lea ($arg5), $arg3
2135 call asm_AES_encrypt # generate initial tweak
2137 mov 240($key), %eax # rounds
2138 mov $len, %rbx # backup $len
2140 mov %eax, %edx # rounds
2141 shl \$7, %rax # 128 bytes per inner round key
2142 sub \$`128-32`, %rax # size of bit-sliced key schedule
2143 sub %rax, %rsp
2145 mov %rsp, %rax # pass key schedule
2146 mov $key, %rcx # pass key
2147 mov %edx, %r10d # pass rounds
2148 call _bsaes_key_convert
2149 pxor %xmm6, %xmm7 # fix up last round key
2150 movdqa %xmm7, (%rax) # save last round key
	and	\$-16, $len
	sub	\$0x80, %rsp		# place for tweak[8]
	movdqa	0x20(%rbp), @XMM[7]	# initial tweak

	pxor	$twtmp, $twtmp
	movdqa	.Lxts_magic(%rip), $twmask
	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits

	sub	\$0x80, $len
	jc	.Lxts_enc_short
	jmp	.Lxts_enc_loop

.align	16
.Lxts_enc_loop:
___
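# The unrolled loop below both stashes tweak[0..7] on the stack (below the
# bit-sliced key schedule, hence "lea 0x80(%rsp)" when the schedule is
# passed) and advances the tweak once per block.  A scalar reference model
# of that update -- multiplication by x in GF(2^128) modulo
# x^128+x^7+x^2+x+1 -- for illustration only, never called by this script
# (assumes a 64-bit perl):
sub xts_tweak_times_x_ref {
	my ($lo, $hi) = @_;			# tweak as two 64-bit halves
	my $carry = ($hi >> 63) & 1;		# bit shifted out at the top
	$hi = (($hi << 1) | (($lo >> 63) & 1)) & 0xffffffffffffffff;
	$lo = (($lo << 1) & 0xffffffffffffffff) ^ ($carry ? 0x87 : 0);
	return ($lo, $hi);			# == tweak * x
}
# The SSE2 version is branchless: pcmpgtd against zero broadcasts the sign
# bit of each dword, pshufd \$0x13 routes the two relevant sign masks,
# pand with .Lxts_magic keeps 0x87 (the reduction) and 1 (the carry between
# the two qword halves, which paddq -- doubling each half independently --
# would otherwise drop), and pxor applies both to the doubled tweak.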
for ($i=0;$i<7;$i++) {
$code.=<<___;
	pshufd	\$0x13, $twtmp, $twres
	pxor	$twtmp, $twtmp
	movdqa	@XMM[7], @XMM[$i]
	movdqa	@XMM[7], `0x10*$i`(%rsp)	# save tweak[$i]
	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
	pand	$twmask, $twres		# isolate carry and residue
	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
	pxor	$twres, @XMM[7]
___
$code.=<<___ if ($i>=1);
	movdqu	`0x10*($i-1)`($inp), @XMM[8+$i-1]
___
$code.=<<___ if ($i>=2);
	pxor	@XMM[8+$i-2], @XMM[$i-2]	# input[] ^ tweak[]
___
}
$code.=<<___;
	movdqu	0x60($inp), @XMM[8+6]
	pxor	@XMM[8+5], @XMM[5]
	movdqu	0x70($inp), @XMM[8+7]
	lea	0x80($inp), $inp
	movdqa	@XMM[7], 0x70(%rsp)
	pxor	@XMM[8+6], @XMM[6]
	lea	0x80(%rsp), %rax	# pass key schedule
	pxor	@XMM[8+7], @XMM[7]
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_encrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[4]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[6]
	movdqu	@XMM[4], 0x20($out)
	pxor	0x40(%rsp), @XMM[3]
	movdqu	@XMM[6], 0x30($out)
	pxor	0x50(%rsp), @XMM[7]
	movdqu	@XMM[3], 0x40($out)
	pxor	0x60(%rsp), @XMM[2]
	movdqu	@XMM[7], 0x50($out)
	pxor	0x70(%rsp), @XMM[5]
	movdqu	@XMM[2], 0x60($out)
	movdqu	@XMM[5], 0x70($out)
	lea	0x80($out), $out

	movdqa	0x70(%rsp), @XMM[7]	# prepare next iteration tweak
	pxor	$twtmp, $twtmp
	movdqa	.Lxts_magic(%rip), $twmask
	pcmpgtd	@XMM[7], $twtmp
	pshufd	\$0x13, $twtmp, $twres
	pxor	$twtmp, $twtmp
	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
	pand	$twmask, $twres		# isolate carry and residue
	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
	pxor	$twres, @XMM[7]

	sub	\$0x80,$len
	jnc	.Lxts_enc_loop

.Lxts_enc_short:
	add	\$0x80, $len
	jz	.Lxts_enc_done
___
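# Same tweak schedule as the main loop, but interleaved with a dispatch on
# how many 16-byte blocks remain: $i blocks branch to .Lxts_enc_$i (1..6),
# while seven blocks fall through to the code right after the loop.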
for ($i=0;$i<7;$i++) {
$code.=<<___;
	pshufd	\$0x13, $twtmp, $twres
	pxor	$twtmp, $twtmp
	movdqa	@XMM[7], @XMM[$i]
	movdqa	@XMM[7], `0x10*$i`(%rsp)	# save tweak[$i]
	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
	pand	$twmask, $twres		# isolate carry and residue
	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
	pxor	$twres, @XMM[7]
___
$code.=<<___ if ($i>=1);
	movdqu	`0x10*($i-1)`($inp), @XMM[8+$i-1]
	cmp	\$`0x10*$i`,$len
	je	.Lxts_enc_$i
___
$code.=<<___ if ($i>=2);
	pxor	@XMM[8+$i-2], @XMM[$i-2]	# input[] ^ tweak[]
___
}
$code.=<<___;
	movdqu	0x60($inp), @XMM[8+6]
	pxor	@XMM[8+5], @XMM[5]
	movdqa	@XMM[7], 0x70(%rsp)
	lea	0x70($inp), $inp
	pxor	@XMM[8+6], @XMM[6]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_encrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[4]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[6]
	movdqu	@XMM[4], 0x20($out)
	pxor	0x40(%rsp), @XMM[3]
	movdqu	@XMM[6], 0x30($out)
	pxor	0x50(%rsp), @XMM[7]
	movdqu	@XMM[3], 0x40($out)
	pxor	0x60(%rsp), @XMM[2]
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[2], 0x60($out)
	lea	0x70($out), $out

	movdqa	0x70(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_enc_done
.align	16
.Lxts_enc_6:
	pxor	@XMM[8+4], @XMM[4]
	lea	0x60($inp), $inp
	pxor	@XMM[8+5], @XMM[5]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_encrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[4]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[6]
	movdqu	@XMM[4], 0x20($out)
	pxor	0x40(%rsp), @XMM[3]
	movdqu	@XMM[6], 0x30($out)
	pxor	0x50(%rsp), @XMM[7]
	movdqu	@XMM[3], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	lea	0x60($out), $out

	movdqa	0x60(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_enc_done
.align	16
.Lxts_enc_5:
	pxor	@XMM[8+3], @XMM[3]
	lea	0x50($inp), $inp
	pxor	@XMM[8+4], @XMM[4]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_encrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[4]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[6]
	movdqu	@XMM[4], 0x20($out)
	pxor	0x40(%rsp), @XMM[3]
	movdqu	@XMM[6], 0x30($out)
	movdqu	@XMM[3], 0x40($out)
	lea	0x50($out), $out

	movdqa	0x50(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_enc_done
.align	16
.Lxts_enc_4:
	pxor	@XMM[8+2], @XMM[2]
	lea	0x40($inp), $inp
	pxor	@XMM[8+3], @XMM[3]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_encrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[4]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[6]
	movdqu	@XMM[4], 0x20($out)
	movdqu	@XMM[6], 0x30($out)
	lea	0x40($out), $out

	movdqa	0x40(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_enc_done
.align	16
.Lxts_enc_3:
	pxor	@XMM[8+1], @XMM[1]
	lea	0x30($inp), $inp
	pxor	@XMM[8+2], @XMM[2]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_encrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[4]
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	lea	0x30($out), $out

	movdqa	0x30(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_enc_done
.align	16
.Lxts_enc_2:
	pxor	@XMM[8+0], @XMM[0]
	lea	0x20($inp), $inp
	pxor	@XMM[8+1], @XMM[1]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_encrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	lea	0x20($out), $out

	movdqa	0x20(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_enc_done
.align	16
.Lxts_enc_1:
	pxor	@XMM[0], @XMM[8]
	lea	0x10($inp), $inp
	movdqa	@XMM[8], 0x20(%rbp)
	lea	0x20(%rbp), $arg1
	lea	0x20(%rbp), $arg2
	lea	($key), $arg3
	call	asm_AES_encrypt		# doesn't touch %xmm
	pxor	0x20(%rbp), @XMM[0]	# ^= tweak[]
	#pxor	@XMM[8], @XMM[0]
	#lea	0x80(%rsp), %rax	# pass key schedule
	#mov	%edx, %r10d		# pass rounds
	#call	_bsaes_encrypt8
	#pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	movdqu	@XMM[0], 0x00($out)	# write output
	lea	0x10($out), $out

	movdqa	0x10(%rsp), @XMM[7]	# next iteration tweak

.Lxts_enc_done:
	and	\$15, %ebx
	jz	.Lxts_enc_ret
	mov	$out, %rdx
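# Ciphertext stealing for the len%16 tail; in effect (a sketch):
#	C_m     = first len%16 bytes of C_{m-1}		# stolen ciphertext
#	P'      = P_m || remaining bytes of C_{m-1}	# padded final block
#	C_{m-1} = E(key1, P' ^ T_m) ^ T_m		# re-encrypt with next tweak
# The byte loop below performs the swap in place, just behind the
# ciphertext written so far.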
.Lxts_enc_steal:
	movzb	($inp), %eax
	movzb	-16(%rdx), %ecx
	lea	1($inp), $inp
	mov	%al, -16(%rdx)
	mov	%cl, 0(%rdx)
	lea	1(%rdx), %rdx
	sub	\$1,%ebx
	jnz	.Lxts_enc_steal

	movdqu	-16($out), @XMM[0]
	lea	0x20(%rbp), $arg1
	pxor	@XMM[7], @XMM[0]
	lea	0x20(%rbp), $arg2
	movdqa	@XMM[0], 0x20(%rbp)
	lea	($key), $arg3
	call	asm_AES_encrypt		# doesn't touch %xmm
	pxor	0x20(%rbp), @XMM[7]
	movdqu	@XMM[7], -16($out)

.Lxts_enc_ret:
	lea	(%rsp), %rax
	pxor	%xmm0, %xmm0
.Lxts_enc_bzero:			# wipe key schedule [if any]
	movdqa	%xmm0, 0x00(%rax)
	movdqa	%xmm0, 0x10(%rax)
	lea	0x20(%rax), %rax
	cmp	%rax, %rbp
	ja	.Lxts_enc_bzero

	lea	(%rbp),%rsp		# restore %rsp
___
$code.=<<___ if ($win64);
	movaps	0x40(%rbp), %xmm6
	movaps	0x50(%rbp), %xmm7
	movaps	0x60(%rbp), %xmm8
	movaps	0x70(%rbp), %xmm9
	movaps	0x80(%rbp), %xmm10
	movaps	0x90(%rbp), %xmm11
	movaps	0xa0(%rbp), %xmm12
	movaps	0xb0(%rbp), %xmm13
	movaps	0xc0(%rbp), %xmm14
	movaps	0xd0(%rbp), %xmm15
	lea	0xa0(%rbp), %rsp
___
$code.=<<___;
	mov	0x48(%rsp), %r15
	mov	0x50(%rsp), %r14
	mov	0x58(%rsp), %r13
	mov	0x60(%rsp), %r12
	mov	0x68(%rsp), %rbx
	mov	0x70(%rsp), %rax
	lea	0x78(%rsp), %rsp
	mov	%rax, %rbp
.Lxts_enc_epilogue:
	ret
.size	bsaes_xts_encrypt,.-bsaes_xts_encrypt

.globl	bsaes_xts_decrypt
.type	bsaes_xts_decrypt,\@abi-omnipotent
.align	16
bsaes_xts_decrypt:
	mov	%rsp, %rax
.Lxts_dec_prologue:
	push	%rbp
	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	lea	-0x48(%rsp), %rsp
___
$code.=<<___ if ($win64);
	mov	0xa0(%rsp),$arg5	# pull key2
	mov	0xa8(%rsp),$arg6	# pull ivp
	lea	-0xa0(%rsp), %rsp
	movaps	%xmm6, 0x40(%rsp)
	movaps	%xmm7, 0x50(%rsp)
	movaps	%xmm8, 0x60(%rsp)
	movaps	%xmm9, 0x70(%rsp)
	movaps	%xmm10, 0x80(%rsp)
	movaps	%xmm11, 0x90(%rsp)
	movaps	%xmm12, 0xa0(%rsp)
	movaps	%xmm13, 0xb0(%rsp)
	movaps	%xmm14, 0xc0(%rsp)
	movaps	%xmm15, 0xd0(%rsp)
.Lxts_dec_body:
___
$code.=<<___;
	mov	%rsp, %rbp		# backup %rsp
	mov	$arg1, $inp		# backup arguments
	mov	$arg2, $out
	mov	$arg3, $len
	mov	$arg4, $key

	lea	($arg6), $arg1
	lea	0x20(%rbp), $arg2
	lea	($arg5), $arg3
	call	asm_AES_encrypt		# generate initial tweak

	mov	240($key), %eax		# rounds
	mov	$len, %rbx		# backup $len

	mov	%eax, %edx		# rounds
	shl	\$7, %rax		# 128 bytes per inner round key
	sub	\$`128-32`, %rax	# size of bit-sliced key schedule
	sub	%rax, %rsp

	mov	%rsp, %rax		# pass key schedule
	mov	$key, %rcx		# pass key
	mov	%edx, %r10d		# pass rounds
	call	_bsaes_key_convert
	pxor	(%rsp), %xmm7		# fix up round 0 key
	movdqa	%xmm6, (%rax)		# save last round key
	movdqa	%xmm7, (%rsp)

	xor	%eax, %eax		# if ($len%16) len-=16;
	and	\$-16, $len
	test	\$15, %ebx
	setnz	%al
	shl	\$4, %rax
	sub	%rax, $len
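# Branchless "if (len % 16) len -= 16": hold one complete block back from
# the bulk loop so it can be decrypted together with the stolen tail at
# .Lxts_dec_steal (decryption must special-case the last two blocks).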
	sub	\$0x80, %rsp		# place for tweak[8]
	movdqa	0x20(%rbp), @XMM[7]	# initial tweak

	pxor	$twtmp, $twtmp
	movdqa	.Lxts_magic(%rip), $twmask
	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits

	sub	\$0x80, $len
	jc	.Lxts_dec_short
	jmp	.Lxts_dec_loop

.align	16
.Lxts_dec_loop:
___
for ($i=0;$i<7;$i++) {
$code.=<<___;
	pshufd	\$0x13, $twtmp, $twres
	pxor	$twtmp, $twtmp
	movdqa	@XMM[7], @XMM[$i]
	movdqa	@XMM[7], `0x10*$i`(%rsp)	# save tweak[$i]
	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
	pand	$twmask, $twres		# isolate carry and residue
	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
	pxor	$twres, @XMM[7]
___
$code.=<<___ if ($i>=1);
	movdqu	`0x10*($i-1)`($inp), @XMM[8+$i-1]
___
$code.=<<___ if ($i>=2);
	pxor	@XMM[8+$i-2], @XMM[$i-2]	# input[] ^ tweak[]
___
}
$code.=<<___;
	movdqu	0x60($inp), @XMM[8+6]
	pxor	@XMM[8+5], @XMM[5]
	movdqu	0x70($inp), @XMM[8+7]
	lea	0x80($inp), $inp
	movdqa	@XMM[7], 0x70(%rsp)
	pxor	@XMM[8+6], @XMM[6]
	lea	0x80(%rsp), %rax	# pass key schedule
	pxor	@XMM[8+7], @XMM[7]
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_decrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[6]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[4]
	movdqu	@XMM[6], 0x20($out)
	pxor	0x40(%rsp), @XMM[2]
	movdqu	@XMM[4], 0x30($out)
	pxor	0x50(%rsp), @XMM[7]
	movdqu	@XMM[2], 0x40($out)
	pxor	0x60(%rsp), @XMM[3]
	movdqu	@XMM[7], 0x50($out)
	pxor	0x70(%rsp), @XMM[5]
	movdqu	@XMM[3], 0x60($out)
	movdqu	@XMM[5], 0x70($out)
	lea	0x80($out), $out

	movdqa	0x70(%rsp), @XMM[7]	# prepare next iteration tweak
	pxor	$twtmp, $twtmp
	movdqa	.Lxts_magic(%rip), $twmask
	pcmpgtd	@XMM[7], $twtmp
	pshufd	\$0x13, $twtmp, $twres
	pxor	$twtmp, $twtmp
	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
	pand	$twmask, $twres		# isolate carry and residue
	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
	pxor	$twres, @XMM[7]

	sub	\$0x80,$len
	jnc	.Lxts_dec_loop

.Lxts_dec_short:
	add	\$0x80, $len
	jz	.Lxts_dec_done
___
for ($i=0;$i<7;$i++) {
$code.=<<___;
	pshufd	\$0x13, $twtmp, $twres
	pxor	$twtmp, $twtmp
	movdqa	@XMM[7], @XMM[$i]
	movdqa	@XMM[7], `0x10*$i`(%rsp)	# save tweak[$i]
	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
	pand	$twmask, $twres		# isolate carry and residue
	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
	pxor	$twres, @XMM[7]
___
$code.=<<___ if ($i>=1);
	movdqu	`0x10*($i-1)`($inp), @XMM[8+$i-1]
	cmp	\$`0x10*$i`,$len
	je	.Lxts_dec_$i
___
$code.=<<___ if ($i>=2);
	pxor	@XMM[8+$i-2], @XMM[$i-2]	# input[] ^ tweak[]
___
}
$code.=<<___;
	movdqu	0x60($inp), @XMM[8+6]
	pxor	@XMM[8+5], @XMM[5]
	movdqa	@XMM[7], 0x70(%rsp)
	lea	0x70($inp), $inp
	pxor	@XMM[8+6], @XMM[6]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_decrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[6]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[4]
	movdqu	@XMM[6], 0x20($out)
	pxor	0x40(%rsp), @XMM[2]
	movdqu	@XMM[4], 0x30($out)
	pxor	0x50(%rsp), @XMM[7]
	movdqu	@XMM[2], 0x40($out)
	pxor	0x60(%rsp), @XMM[3]
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[3], 0x60($out)
	lea	0x70($out), $out

	movdqa	0x70(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_dec_done
.align	16
.Lxts_dec_6:
	pxor	@XMM[8+4], @XMM[4]
	lea	0x60($inp), $inp
	pxor	@XMM[8+5], @XMM[5]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_decrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[6]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[4]
	movdqu	@XMM[6], 0x20($out)
	pxor	0x40(%rsp), @XMM[2]
	movdqu	@XMM[4], 0x30($out)
	pxor	0x50(%rsp), @XMM[7]
	movdqu	@XMM[2], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	lea	0x60($out), $out

	movdqa	0x60(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_dec_done
.align	16
.Lxts_dec_5:
	pxor	@XMM[8+3], @XMM[3]
	lea	0x50($inp), $inp
	pxor	@XMM[8+4], @XMM[4]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_decrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[6]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[4]
	movdqu	@XMM[6], 0x20($out)
	pxor	0x40(%rsp), @XMM[2]
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	lea	0x50($out), $out

	movdqa	0x50(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_dec_done
.align	16
.Lxts_dec_4:
	pxor	@XMM[8+2], @XMM[2]
	lea	0x40($inp), $inp
	pxor	@XMM[8+3], @XMM[3]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_decrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[6]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[4]
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	lea	0x40($out), $out

	movdqa	0x40(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_dec_done
.align	16
.Lxts_dec_3:
	pxor	@XMM[8+1], @XMM[1]
	lea	0x30($inp), $inp
	pxor	@XMM[8+2], @XMM[2]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_decrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[6]
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	lea	0x30($out), $out

	movdqa	0x30(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_dec_done
.align	16
.Lxts_dec_2:
	pxor	@XMM[8+0], @XMM[0]
	lea	0x20($inp), $inp
	pxor	@XMM[8+1], @XMM[1]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_decrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	lea	0x20($out), $out

	movdqa	0x20(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_dec_done
.align	16
.Lxts_dec_1:
	pxor	@XMM[0], @XMM[8]
	lea	0x10($inp), $inp
	movdqa	@XMM[8], 0x20(%rbp)
	lea	0x20(%rbp), $arg1
	lea	0x20(%rbp), $arg2
	lea	($key), $arg3
	call	asm_AES_decrypt		# doesn't touch %xmm
	pxor	0x20(%rbp), @XMM[0]	# ^= tweak[]
	#pxor	@XMM[8], @XMM[0]
	#lea	0x80(%rsp), %rax	# pass key schedule
	#mov	%edx, %r10d		# pass rounds
	#call	_bsaes_decrypt8
	#pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	movdqu	@XMM[0], 0x00($out)	# write output
	lea	0x10($out), $out

	movdqa	0x10(%rsp), @XMM[7]	# next iteration tweak

.Lxts_dec_done:
	and	\$15, %ebx
	jz	.Lxts_dec_ret
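# Decryption-side ciphertext stealing swaps the order of the last two
# tweak values: the last complete ciphertext block is decrypted with the
# *next* tweak (computed just below, while the current one is parked in a
# spare register), and the reassembled final block with the current one --
# the mirror image of .Lxts_enc_steal above.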
	pxor	$twtmp, $twtmp
	movdqa	.Lxts_magic(%rip), $twmask
	pcmpgtd	@XMM[7], $twtmp
	pshufd	\$0x13, $twtmp, $twres
	movdqa	@XMM[7], @XMM[6]
	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
	pand	$twmask, $twres		# isolate carry and residue
	movdqu	($inp), @XMM[0]
	pxor	$twres, @XMM[7]

	lea	0x20(%rbp), $arg1
	pxor	@XMM[7], @XMM[0]
	lea	0x20(%rbp), $arg2
	movdqa	@XMM[0], 0x20(%rbp)
	lea	($key), $arg3
	call	asm_AES_decrypt		# doesn't touch %xmm
	pxor	0x20(%rbp), @XMM[7]
	mov	$out, %rdx
	movdqu	@XMM[7], ($out)

.Lxts_dec_steal:
	movzb	16($inp), %eax
	movzb	(%rdx), %ecx
	lea	1($inp), $inp
	mov	%al, (%rdx)
	mov	%cl, 16(%rdx)
	lea	1(%rdx), %rdx
	sub	\$1,%ebx
	jnz	.Lxts_dec_steal

	movdqu	($out), @XMM[0]
	lea	0x20(%rbp), $arg1
	pxor	@XMM[6], @XMM[0]
	lea	0x20(%rbp), $arg2
	movdqa	@XMM[0], 0x20(%rbp)
	lea	($key), $arg3
	call	asm_AES_decrypt		# doesn't touch %xmm
	pxor	0x20(%rbp), @XMM[6]
	movdqu	@XMM[6], ($out)

.Lxts_dec_ret:
	lea	(%rsp), %rax
	pxor	%xmm0, %xmm0
.Lxts_dec_bzero:			# wipe key schedule [if any]
	movdqa	%xmm0, 0x00(%rax)
	movdqa	%xmm0, 0x10(%rax)
	lea	0x20(%rax), %rax
	cmp	%rax, %rbp
	ja	.Lxts_dec_bzero

	lea	(%rbp),%rsp		# restore %rsp
___
$code.=<<___ if ($win64);
	movaps	0x40(%rbp), %xmm6
	movaps	0x50(%rbp), %xmm7
	movaps	0x60(%rbp), %xmm8
	movaps	0x70(%rbp), %xmm9
	movaps	0x80(%rbp), %xmm10
	movaps	0x90(%rbp), %xmm11
	movaps	0xa0(%rbp), %xmm12
	movaps	0xb0(%rbp), %xmm13
	movaps	0xc0(%rbp), %xmm14
	movaps	0xd0(%rbp), %xmm15
	lea	0xa0(%rbp), %rsp
___
$code.=<<___;
	mov	0x48(%rsp), %r15
	mov	0x50(%rsp), %r14
	mov	0x58(%rsp), %r13
	mov	0x60(%rsp), %r12
	mov	0x68(%rsp), %rbx
	mov	0x70(%rsp), %rax
	lea	0x78(%rsp), %rsp
	mov	%rax, %rbp
.Lxts_dec_epilogue:
	ret
.size	bsaes_xts_decrypt,.-bsaes_xts_decrypt
___

$code.=<<___;
.type	_bsaes_const,\@object
.align	64
_bsaes_const:
.LM0ISR:	# InvShiftRows constants
	.quad	0x0a0e0206070b0f03, 0x0004080c0d010509
.LISRM0:
	.quad	0x01040b0e0205080f, 0x0306090c00070a0d
.LISR:
	.quad	0x0504070602010003, 0x0f0e0d0c080b0a09
.LBS0:		# bit-slice constants
	.quad	0x5555555555555555, 0x5555555555555555
.LBS1:
	.quad	0x3333333333333333, 0x3333333333333333
.LBS2:
	.quad	0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
.LSR:		# shiftrows constants
	.quad	0x0504070600030201, 0x0f0e0d0c0a09080b
.LSRM0:
	.quad	0x0304090e00050a0f, 0x01060b0c0207080d
.LM0SR:
	.quad	0x0a0e02060f03070b, 0x0004080c05090d01
.LSWPUP:	# byte-swap upper dword
	.quad	0x0706050403020100, 0x0c0d0e0f0b0a0908
.LSWPUPM0SR:
	.quad	0x0a0d02060c03070b, 0x0004080f05090e01
.LADD1:		# counter increment constants
	.quad	0x0000000000000000, 0x0000000100000000
.LADD2:
	.quad	0x0000000000000000, 0x0000000200000000
.LADD3:
	.quad	0x0000000000000000, 0x0000000300000000
.LADD4:
	.quad	0x0000000000000000, 0x0000000400000000
.LADD5:
	.quad	0x0000000000000000, 0x0000000500000000
.LADD6:
	.quad	0x0000000000000000, 0x0000000600000000
.LADD7:
	.quad	0x0000000000000000, 0x0000000700000000
.LADD8:
	.quad	0x0000000000000000, 0x0000000800000000
.Lxts_magic:
	.long	0x87,0,1,0
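# (dwords {0x87,0,1,0}: 0x87 is the GF(2^128) reduction XORed into the low
# byte on carry-out of bit 127, and the 1 re-inserts the bit that paddq --
# which doubles each 64-bit half independently -- drops between the halves)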
.Lmasks:
	.quad	0x0101010101010101, 0x0101010101010101
	.quad	0x0202020202020202, 0x0202020202020202
	.quad	0x0404040404040404, 0x0404040404040404
	.quad	0x0808080808080808, 0x0808080808080808
.LM0:
	.quad	0x02060a0e03070b0f, 0x0004080c0105090d
.L63:
	.quad	0x6363636363636363, 0x6363636363636363
.asciz	"Bit-sliced AES for x86_64/SSSE3, Emilia Käsper, Peter Schwabe, Andy Polyakov"
.align	64
.size	_bsaes_const,.-_bsaes_const
___

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";
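# se_handler is the structured-exception handler bound to every entry
# point via .pdata/.xdata below.  If the fault hits between the
# HandlerData[] bounds (body..epilogue), it reconstructs the interrupted
# frame from context->Rbp: the ten saved %xmm registers are copied back
# into the CONTEXT record, the six non-volatile GPRs and the caller's
# %rsp are restored, and RtlVirtualUnwind then continues the unwind;
# either way it returns ExceptionContinueSearch.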
$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
.align	16
se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lin_prologue

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lin_prologue

	mov	160($context),%rax	# pull context->Rbp

	lea	0x40(%rax),%rsi		# %xmm save area
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$20,%ecx		# 10*sizeof(%xmm0)/sizeof(%rax)
	.long	0xa548f3fc		# cld; rep movsq
	lea	0xa0(%rax),%rax		# adjust stack pointer

	mov	0x70(%rax),%rbp
	mov	0x68(%rax),%rbx
	mov	0x60(%rax),%r12
	mov	0x58(%rax),%r13
	mov	0x50(%rax),%r14
	mov	0x48(%rax),%r15
	lea	0x78(%rax),%rax		# adjust stack pointer
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

.Lin_prologue:
	mov	%rax,152($context)	# restore context->Rsp

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$`1232/8`,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	se_handler,.-se_handler

.section	.pdata
.align	4
___
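# Each .pdata entry is a RUNTIME_FUNCTION triplet of RVAs -- function
# start, function end, unwind info -- binding se_handler's unwind data
# to the corresponding public entry point.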
$code.=<<___ if ($ecb);
	.rva	.Lecb_enc_prologue
	.rva	.Lecb_enc_epilogue
	.rva	.Lecb_enc_info

	.rva	.Lecb_dec_prologue
	.rva	.Lecb_dec_epilogue
	.rva	.Lecb_dec_info
___
$code.=<<___;
	.rva	.Lcbc_dec_prologue
	.rva	.Lcbc_dec_epilogue
	.rva	.Lcbc_dec_info

	.rva	.Lctr_enc_prologue
	.rva	.Lctr_enc_epilogue
	.rva	.Lctr_enc_info

	.rva	.Lxts_enc_prologue
	.rva	.Lxts_enc_epilogue
	.rva	.Lxts_enc_info

	.rva	.Lxts_dec_prologue
	.rva	.Lxts_dec_epilogue
	.rva	.Lxts_dec_info

.section	.xdata
.align	8
___
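# Each _info block below is an UNWIND_INFO structure: the leading
# .byte 9,0,0,0 encodes version 1 with UNW_FLAG_EHANDLER, the first .rva
# names se_handler, and the following pair of RVAs is the HandlerData[]
# (body and epilogue labels) that se_handler compares against.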
$code.=<<___ if ($ecb);
.Lecb_enc_info:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lecb_enc_body,.Lecb_enc_epilogue	# HandlerData[]
.Lecb_dec_info:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lecb_dec_body,.Lecb_dec_epilogue	# HandlerData[]
___
$code.=<<___;
.Lcbc_dec_info:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lcbc_dec_body,.Lcbc_dec_epilogue	# HandlerData[]
.Lctr_enc_info:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lctr_enc_body,.Lctr_enc_epilogue	# HandlerData[]
.Lxts_enc_info:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lxts_enc_body,.Lxts_enc_epilogue	# HandlerData[]
.Lxts_dec_info:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lxts_dec_body,.Lxts_dec_epilogue	# HandlerData[]
___
}

$code =~ s/\`([^\`]*)\`/eval($1)/gem;

print $code;

close STDOUT;