if_iwm - Recognize IWM_FW_PAGING_BLOCK_CMD wide cmd response correctly.
[dragonfly.git] / crypto / openssl / crypto / aes / asm / bsaes-x86_64.pl
blob3f7d33c45bce7154a54789eb6c1a6902b17ad3fa
1 #!/usr/bin/env perl
3 ###################################################################
4 ### AES-128 [originally in CTR mode] ###
5 ### bitsliced implementation for Intel Core 2 processors ###
6 ### requires support of SSE extensions up to SSSE3 ###
7 ### Author: Emilia Käsper and Peter Schwabe ###
8 ### Date: 2009-03-19 ###
9 ### Public domain ###
10 ### ###
11 ### See http://homes.esat.kuleuven.be/~ekasper/#software for ###
12 ### further information. ###
13 ###################################################################
15 # September 2011.
17 # Started as a transliteration to "perlasm", the original code has
18 # undergone the following changes:
20 # - code was made position-independent;
21 # - rounds were folded into a loop resulting in >5x size reduction
22 # from 12.5KB to 2.2KB;
23 # - above was possible thanks to mixcolumns() modification that
24 # allowed to feed its output back to aesenc[last], this was
25 # achieved at cost of two additional inter-registers moves;
26 # - some instruction reordering and interleaving;
27 # - this module doesn't implement key setup subroutine, instead it
28 # relies on conversion of "conventional" key schedule as returned
29 # by AES_set_encrypt_key (see discussion below);
30 # - first and last round keys are treated differently, which allowed
31 # to skip one shiftrows(), reduce bit-sliced key schedule and
32 # speed-up conversion by 22%;
33 # - support for 192- and 256-bit keys was added;
35 # Resulting performance in CPU cycles spent to encrypt one byte out
36 # of 4096-byte buffer with 128-bit key is:
38 # Emilia's this(*) difference
40 # Core 2 9.30 8.69 +7%
41 # Nehalem(**) 7.63 6.88 +11%
42 # Atom 17.1 16.4 +4%
43 # Silvermont - 12.9
45 # (*) Comparison is not completely fair, because "this" is ECB,
46 # i.e. no extra processing such as counter values calculation
47 # and xor-ing input as in Emilia's CTR implementation is
48 # performed. However, the CTR calculations stand for not more
49 # than 1% of total time, so comparison is *rather* fair.
51 # (**) Results were collected on Westmere, which is considered to
52 # be equivalent to Nehalem for this code.
54 # As for key schedule conversion subroutine. Interface to OpenSSL
55 # relies on per-invocation on-the-fly conversion. This naturally
56 # has impact on performance, especially for short inputs. Conversion
57 # time in CPU cycles and its ratio to CPU cycles spent in 8x block
58 # function is:
60 # conversion conversion/8x block
61 # Core 2 240 0.22
62 # Nehalem 180 0.20
63 # Atom 430 0.20
65 # The ratio values mean that 128-byte blocks will be processed
66 # 16-18% slower, 256-byte blocks - 9-10%, 384-byte blocks - 6-7%,
67 # etc. Then keep in mind that input sizes not divisible by 128 are
68 # *effectively* slower, especially shortest ones, e.g. consecutive
69 # 144-byte blocks are processed 44% slower than one would expect,
70 # 272 - 29%, 400 - 22%, etc. Yet, despite all these "shortcomings"
71 # it's still faster than ["hyper-threading-safe" code path in]
72 # aes-x86_64.pl on all lengths above 64 bytes...
74 # October 2011.
76 # Add decryption procedure. Performance in CPU cycles spent to decrypt
77 # one byte out of 4096-byte buffer with 128-bit key is:
79 # Core 2 9.98
80 # Nehalem 7.80
81 # Atom 17.9
82 # Silvermont 14.0
84 # November 2011.
86 # Add bsaes_xts_[en|de]crypt. Less-than-80-bytes-block performance is
87 # suboptimal, but XTS is meant to be used with larger blocks...
89 # <appro@openssl.org>
# Command-line handling and output redirection.
#
# Two optional positional arguments are read: a flavour and an output file
# name. A first argument that contains a dot is taken to be the output file
# name, in which case no flavour is set.
$flavour = shift;
$output = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

# Win64 is selected by an nasm/masm/mingw64 flavour or by a .asm output name.
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Look for the x86_64-xlate.pl translator next to this script first, then in
# ../../perlasm relative to it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Everything printed to STDOUT from here on is piped through the translator.
# Failing to start the pipe is fatal rather than silently producing no output.
open OUT,"| \"$^X\" $xlate $flavour $output"
    or die "can't call $xlate: $!";
*STDOUT=*OUT;
# Register names used by the code generators that follow.
# NOTE(review): the first list names five variables but supplies only four
# values, so $ivp is left undefined by this statement.
# NOTE(review): $key is declared a second time on the last line with a
# different register; original lines 108-109 are absent from this copy, so the
# scoping between the two declarations cannot be confirmed from here.
105 my ($inp,$out,$len,$key,$ivp)=("%rdi","%rsi","%rdx","%rcx");
106 my @XMM=map("%xmm$_",(15,0..14)); # best on Atom, +10% over (0..15)
107 my $ecb=0; # suppress unreferenced ECB subroutines, spare some space...
110 my ($key,$rounds,$const)=("%rax","%r10d","%r11");
# Sbox(@b[0..7], @t[0..3], @s[0..3])
# Chains InBasisChange, Inv_GF256 and OutBasisChange over the eight registers
# in @b; @t and @s are handed to Inv_GF256 as scratch registers. The index
# lists passed to each stage follow the output order that the previous stage
# documents for itself.
112 sub Sbox {
113 # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
114 # output in lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb
115 my @b=@_[0..7];
116 my @t=@_[8..11];
117 my @s=@_[12..15];
118 &InBasisChange (@b);
119 &Inv_GF256 (@b[6,5,0,3,7,1,4,2],@t,@s);
120 &OutBasisChange (@b[7,1,4,2,6,5,0,3]);
# InBasisChange(@b[0..7])
# Appends a fixed sequence of pxor instructions over the eight registers to
# $code. Only the eight arguments are touched; no scratch registers are used.
123 sub InBasisChange {
124 # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
125 # output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb
126 my @b=@_[0..7];
127 $code.=<<___;
128 pxor @b[6], @b[5]
129 pxor @b[1], @b[2]
130 pxor @b[0], @b[3]
131 pxor @b[2], @b[6]
132 pxor @b[0], @b[5]
134 pxor @b[3], @b[6]
135 pxor @b[7], @b[3]
136 pxor @b[5], @b[7]
137 pxor @b[4], @b[3]
138 pxor @b[5], @b[4]
139 pxor @b[1], @b[3]
141 pxor @b[7], @b[2]
142 pxor @b[5], @b[1]
# OutBasisChange(@b[0..7])
# Appends a fixed sequence of pxor instructions over the eight registers to
# $code. Only the eight arguments are touched; no scratch registers are used.
146 sub OutBasisChange {
147 # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
148 # output in lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb
149 my @b=@_[0..7];
150 $code.=<<___;
151 pxor @b[6], @b[0]
152 pxor @b[4], @b[1]
153 pxor @b[0], @b[2]
154 pxor @b[6], @b[4]
155 pxor @b[1], @b[6]
157 pxor @b[5], @b[1]
158 pxor @b[3], @b[5]
159 pxor @b[7], @b[3]
160 pxor @b[5], @b[7]
161 pxor @b[5], @b[2]
163 pxor @b[7], @b[4]
# InvSbox(@b[0..7], @t[0..3], @s[0..3])
# Counterpart of Sbox for decryption: chains InvInBasisChange, Inv_GF256 and
# InvOutBasisChange over the eight registers in @b, with @t and @s passed to
# Inv_GF256 as scratch registers.
167 sub InvSbox {
168 # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
169 # output in lsb > [b0, b1, b6, b4, b2, b7, b3, b5] < msb
170 my @b=@_[0..7];
171 my @t=@_[8..11];
172 my @s=@_[12..15];
173 &InvInBasisChange (@b);
174 &Inv_GF256 (@b[5,1,2,6,3,7,0,4],@t,@s);
175 &InvOutBasisChange (@b[3,7,0,4,5,1,2,6]);
# InvInBasisChange(@b[0..7])
# Re-indexes its eight arguments as @_[5,1,2,6,3,7,0,4] and appends a fixed
# pxor sequence over them to $code. The pxor pairs are the same ones that
# OutBasisChange emits, issued in roughly the opposite order.
178 sub InvInBasisChange { # OutBasisChange in reverse
179 my @b=@_[5,1,2,6,3,7,0,4];
180 $code.=<<___
181 pxor @b[7], @b[4]
183 pxor @b[5], @b[7]
184 pxor @b[5], @b[2]
185 pxor @b[7], @b[3]
186 pxor @b[3], @b[5]
187 pxor @b[5], @b[1]
189 pxor @b[1], @b[6]
190 pxor @b[0], @b[2]
191 pxor @b[6], @b[4]
192 pxor @b[6], @b[0]
193 pxor @b[4], @b[1]
# InvOutBasisChange(@b[0..7])
# Re-indexes its eight arguments as @_[2,5,7,3,6,1,0,4] and appends a fixed
# pxor sequence over them to $code. The pxor pairs are the same ones that
# InBasisChange emits, issued in roughly the opposite order.
197 sub InvOutBasisChange { # InBasisChange in reverse
198 my @b=@_[2,5,7,3,6,1,0,4];
199 $code.=<<___;
200 pxor @b[5], @b[1]
201 pxor @b[7], @b[2]
203 pxor @b[1], @b[3]
204 pxor @b[5], @b[4]
205 pxor @b[5], @b[7]
206 pxor @b[4], @b[3]
207 pxor @b[0], @b[5]
208 pxor @b[7], @b[3]
209 pxor @b[2], @b[6]
210 pxor @b[1], @b[2]
211 pxor @b[3], @b[6]
213 pxor @b[0], @b[3]
214 pxor @b[6], @b[5]
# Mul_GF4($x0, $x1, $y0, $y1, $t0)
# Appends eight instructions to $code. $x0 and $x1 are overwritten with the
# result, $t0 is clobbered, and $y0/$y1 are only read.
218 sub Mul_GF4 {
219 #;*************************************************************
220 #;* Mul_GF4: Input x0-x1,y0-y1 Output x0-x1 Temp t0 (8) *
221 #;*************************************************************
222 my ($x0,$x1,$y0,$y1,$t0)=@_;
223 $code.=<<___;
224 movdqa $y0, $t0
225 pxor $y1, $t0
226 pand $x0, $t0
227 pxor $x1, $x0
228 pand $y0, $x1
229 pand $y1, $x0
230 pxor $x1, $x0
231 pxor $t0, $x1
# Mul_GF4_N($x0, $x1, $y0, $y1, $t0)
# Same first six instructions as Mul_GF4; only the final two pxor steps
# differ (operands swapped). $x0/$x1 receive the result, $t0 is clobbered,
# $y0/$y1 are only read. Marked as unused by its own trailing comment.
235 sub Mul_GF4_N { # not used, see next subroutine
236 # multiply and scale by N
237 my ($x0,$x1,$y0,$y1,$t0)=@_;
238 $code.=<<___;
239 movdqa $y0, $t0
240 pxor $y1, $t0
241 pand $x0, $t0
242 pxor $x1, $x0
243 pand $y0, $x1
244 pand $y1, $x0
245 pxor $x0, $x1
246 pxor $t0, $x0
# Mul_GF4_N_GF4($x0,$x1,$y0,$y1,$t0, $x2,$x3,$y2,$y3,$t1)
# Emits two independent instruction streams interleaved line by line:
# the first five arguments go through the Mul_GF4_N sequence, the last five
# through the Mul_GF4 sequence. $x0..$x3 receive the results, $t0/$t1 are
# clobbered, and the $y registers are only read.
250 sub Mul_GF4_N_GF4 {
251 # interleaved Mul_GF4_N and Mul_GF4
252 my ($x0,$x1,$y0,$y1,$t0,
253 $x2,$x3,$y2,$y3,$t1)=@_;
254 $code.=<<___;
255 movdqa $y0, $t0
256 movdqa $y2, $t1
257 pxor $y1, $t0
258 pxor $y3, $t1
259 pand $x0, $t0
260 pand $x2, $t1
261 pxor $x1, $x0
262 pxor $x3, $x2
263 pand $y0, $x1
264 pand $y2, $x3
265 pand $y1, $x0
266 pand $y3, $x2
267 pxor $x0, $x1
268 pxor $x3, $x2
269 pxor $t0, $x0
270 pxor $t1, $x3
# Mul_GF16_2(@x[0..7], @y[0..3], @t[0..3])
# Processes @x[0..3] and then @x[4..7] against the same four @y registers,
# using Mul_GF4 and Mul_GF4_N_GF4 as building blocks. Results are left in @x;
# all four @t registers are clobbered.
# @y[0] and @y[1] are XORed with @y[2] and @y[3] part-way through and XORed
# with them again later, which restores their original values.
# The first Mul_GF4_N_GF4 call is written without the leading '&' used by the
# other calls; it still resolves because that sub is defined above.
273 sub Mul_GF16_2 {
274 my @x=@_[0..7];
275 my @y=@_[8..11];
276 my @t=@_[12..15];
277 $code.=<<___;
278 movdqa @x[0], @t[0]
279 movdqa @x[1], @t[1]
281 &Mul_GF4 (@x[0], @x[1], @y[0], @y[1], @t[2]);
282 $code.=<<___;
283 pxor @x[2], @t[0]
284 pxor @x[3], @t[1]
285 pxor @y[2], @y[0]
286 pxor @y[3], @y[1]
288 Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3],
289 @x[2], @x[3], @y[2], @y[3], @t[2]);
290 $code.=<<___;
291 pxor @t[0], @x[0]
292 pxor @t[0], @x[2]
293 pxor @t[1], @x[1]
294 pxor @t[1], @x[3]
296 movdqa @x[4], @t[0]
297 movdqa @x[5], @t[1]
298 pxor @x[6], @t[0]
299 pxor @x[7], @t[1]
301 &Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3],
302 @x[6], @x[7], @y[2], @y[3], @t[2]);
303 $code.=<<___;
304 pxor @y[2], @y[0]
305 pxor @y[3], @y[1]
307 &Mul_GF4 (@x[4], @x[5], @y[0], @y[1], @t[3]);
308 $code.=<<___;
309 pxor @t[0], @x[4]
310 pxor @t[0], @x[6]
311 pxor @t[1], @x[5]
312 pxor @t[1], @x[7]
# Inv_GF256(@x[0..7], @t[0..3], @s[0..3])
# Emits the inversion step shared by Sbox and InvSbox. It works in three
# stages: a block that derives four values in @t from @x, a smaller inversion
# on those four values (leaving its result in @s[3], @s[2], @s[1], @t[1]), and
# a final Mul_GF16_2 call that combines that result with @x.
# @x is updated in place; every @t and @s register is clobbered.
315 sub Inv_GF256 {
316 #;********************************************************************
317 #;* Inv_GF256: Input x0-x7 Output x0-x7 Temp t0-t3,s0-s3 (144) *
318 #;********************************************************************
319 my @x=@_[0..7];
320 my @t=@_[8..11];
321 my @s=@_[12..15];
322 # direct optimizations from hardware
323 $code.=<<___;
324 movdqa @x[4], @t[3]
325 movdqa @x[5], @t[2]
326 movdqa @x[1], @t[1]
327 movdqa @x[7], @s[1]
328 movdqa @x[0], @s[0]
330 pxor @x[6], @t[3]
331 pxor @x[7], @t[2]
332 pxor @x[3], @t[1]
333 movdqa @t[3], @s[2]
334 pxor @x[6], @s[1]
335 movdqa @t[2], @t[0]
336 pxor @x[2], @s[0]
337 movdqa @t[3], @s[3]
339 por @t[1], @t[2]
340 por @s[0], @t[3]
341 pxor @t[0], @s[3]
342 pand @s[0], @s[2]
343 pxor @t[1], @s[0]
344 pand @t[1], @t[0]
345 pand @s[0], @s[3]
346 movdqa @x[3], @s[0]
347 pxor @x[2], @s[0]
348 pand @s[0], @s[1]
349 pxor @s[1], @t[3]
350 pxor @s[1], @t[2]
351 movdqa @x[4], @s[1]
352 movdqa @x[1], @s[0]
353 pxor @x[5], @s[1]
354 pxor @x[0], @s[0]
355 movdqa @s[1], @t[1]
356 pand @s[0], @s[1]
357 por @s[0], @t[1]
358 pxor @s[1], @t[0]
359 pxor @s[3], @t[3]
360 pxor @s[2], @t[2]
361 pxor @s[3], @t[1]
362 movdqa @x[7], @s[0]
363 pxor @s[2], @t[0]
364 movdqa @x[6], @s[1]
365 pxor @s[2], @t[1]
366 movdqa @x[5], @s[2]
367 pand @x[3], @s[0]
368 movdqa @x[4], @s[3]
369 pand @x[2], @s[1]
370 pand @x[1], @s[2]
371 por @x[0], @s[3]
372 pxor @s[0], @t[3]
373 pxor @s[1], @t[2]
374 pxor @s[2], @t[1]
375 pxor @s[3], @t[0]
377 #Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3
379 # new smaller inversion
381 movdqa @t[3], @s[0]
382 pand @t[1], @t[3]
383 pxor @t[2], @s[0]
385 movdqa @t[0], @s[2]
386 movdqa @s[0], @s[3]
387 pxor @t[3], @s[2]
388 pand @s[2], @s[3]
390 movdqa @t[1], @s[1]
391 pxor @t[2], @s[3]
392 pxor @t[0], @s[1]
394 pxor @t[2], @t[3]
396 pand @t[3], @s[1]
398 movdqa @s[2], @t[2]
399 pxor @t[0], @s[1]
401 pxor @s[1], @t[2]
402 pxor @s[1], @t[1]
404 pand @t[0], @t[2]
406 pxor @t[2], @s[2]
407 pxor @t[2], @t[1]
409 pand @s[3], @s[2]
411 pxor @s[0], @s[2]
413 # output in s3, s2, s1, t1
415 # Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \t2, \t3, \t0, \t1, \s0, \s1, \s2, \s3
417 # Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
418 &Mul_GF16_2(@x,@s[3,2,1],@t[1],@s[0],@t[0,2,3]);
420 ### output msb > [x3,x2,x1,x0,x7,x6,x5,x4] < lsb
423 # AES linear components
# ShiftRows(@x[0..7], $mask)
# $mask is taken from the end of the argument list with pop.
# Emits code that XORs the eight 16-byte values at 0x00..0x70($key) into
# @x[0..7], applies pshufb with $mask to each register, and then advances
# $key by 0x80. The XOR with memory at $key is therefore part of this step,
# not a separate one.
# $key is not an argument; it is the register name from the enclosing scope.
425 sub ShiftRows {
426 my @x=@_[0..7];
427 my $mask=pop;
428 $code.=<<___;
429 pxor 0x00($key),@x[0]
430 pxor 0x10($key),@x[1]
431 pxor 0x20($key),@x[2]
432 pxor 0x30($key),@x[3]
433 pshufb $mask,@x[0]
434 pshufb $mask,@x[1]
435 pxor 0x40($key),@x[4]
436 pxor 0x50($key),@x[5]
437 pshufb $mask,@x[2]
438 pshufb $mask,@x[3]
439 pxor 0x60($key),@x[6]
440 pxor 0x70($key),@x[7]
441 pshufb $mask,@x[4]
442 pshufb $mask,@x[5]
443 pshufb $mask,@x[6]
444 pshufb $mask,@x[7]
445 lea 0x80($key),$key
# MixColumns(@x[0..7], @t[0..7] [, $inv])
# Emits a common instruction sequence followed by one of two tails: the first
# when $inv is false, the second when $inv is true. InvMixColumns calls this
# with $inv=1 after its own pre-multiplication step.
# Results end up in @x; all eight @t registers are clobbered.
# NOTE(review): $inv is read with the one-element slice @_[16]; in this scalar
# assignment that yields the same value as $_[16], which is the usual
# spelling.
449 sub MixColumns {
450 # modified to emit output in order suitable for feeding back to aesenc[last]
451 my @x=@_[0..7];
452 my @t=@_[8..15];
453 my $inv=@_[16]; # optional
454 $code.=<<___;
455 pshufd \$0x93, @x[0], @t[0] # x0 <<< 32
456 pshufd \$0x93, @x[1], @t[1]
457 pxor @t[0], @x[0] # x0 ^ (x0 <<< 32)
458 pshufd \$0x93, @x[2], @t[2]
459 pxor @t[1], @x[1]
460 pshufd \$0x93, @x[3], @t[3]
461 pxor @t[2], @x[2]
462 pshufd \$0x93, @x[4], @t[4]
463 pxor @t[3], @x[3]
464 pshufd \$0x93, @x[5], @t[5]
465 pxor @t[4], @x[4]
466 pshufd \$0x93, @x[6], @t[6]
467 pxor @t[5], @x[5]
468 pshufd \$0x93, @x[7], @t[7]
469 pxor @t[6], @x[6]
470 pxor @t[7], @x[7]
472 pxor @x[0], @t[1]
473 pxor @x[7], @t[0]
474 pxor @x[7], @t[1]
475 pshufd \$0x4E, @x[0], @x[0] # (x0 ^ (x0 <<< 32)) <<< 64)
476 pxor @x[1], @t[2]
477 pshufd \$0x4E, @x[1], @x[1]
478 pxor @x[4], @t[5]
479 pxor @t[0], @x[0]
480 pxor @x[5], @t[6]
481 pxor @t[1], @x[1]
482 pxor @x[3], @t[4]
483 pshufd \$0x4E, @x[4], @t[0]
484 pxor @x[6], @t[7]
485 pshufd \$0x4E, @x[5], @t[1]
486 pxor @x[2], @t[3]
487 pshufd \$0x4E, @x[3], @x[4]
488 pxor @x[7], @t[3]
489 pshufd \$0x4E, @x[7], @x[5]
490 pxor @x[7], @t[4]
491 pshufd \$0x4E, @x[6], @x[3]
492 pxor @t[4], @t[0]
493 pshufd \$0x4E, @x[2], @x[6]
494 pxor @t[5], @t[1]
496 $code.=<<___ if (!$inv);
497 pxor @t[3], @x[4]
498 pxor @t[7], @x[5]
499 pxor @t[6], @x[3]
500 movdqa @t[0], @x[2]
501 pxor @t[2], @x[6]
502 movdqa @t[1], @x[7]
504 $code.=<<___ if ($inv);
505 pxor @x[4], @t[3]
506 pxor @t[7], @x[5]
507 pxor @x[3], @t[6]
508 movdqa @t[0], @x[3]
509 pxor @t[2], @x[6]
510 movdqa @t[6], @x[2]
511 movdqa @t[1], @x[7]
512 movdqa @x[6], @x[4]
513 movdqa @t[3], @x[6]
# InvMixColumns_orig(@x[0..7], @t[0..7])
# Emits four consecutive sections, labelled in the generated code as
# multiplication by 0x0e, 0x0b, 0x0d and 0x09. After the first section the
# registers are addressed through @y, a re-indexing of @x as
# @x[7,5,0,2,1,3,4,6].
# The final eight moves copy @t[0..7] into @XMM[0..7], so the result is
# written to the file-level @XMM registers rather than back to @x.
# No call to this sub is visible in this part of the file; InvMixColumns
# below is the one used by the decrypt loop.
517 sub InvMixColumns_orig {
518 my @x=@_[0..7];
519 my @t=@_[8..15];
521 $code.=<<___;
522 # multiplication by 0x0e
523 pshufd \$0x93, @x[7], @t[7]
524 movdqa @x[2], @t[2]
525 pxor @x[5], @x[7] # 7 5
526 pxor @x[5], @x[2] # 2 5
527 pshufd \$0x93, @x[0], @t[0]
528 movdqa @x[5], @t[5]
529 pxor @x[0], @x[5] # 5 0 [1]
530 pxor @x[1], @x[0] # 0 1
531 pshufd \$0x93, @x[1], @t[1]
532 pxor @x[2], @x[1] # 1 25
533 pxor @x[6], @x[0] # 01 6 [2]
534 pxor @x[3], @x[1] # 125 3 [4]
535 pshufd \$0x93, @x[3], @t[3]
536 pxor @x[0], @x[2] # 25 016 [3]
537 pxor @x[7], @x[3] # 3 75
538 pxor @x[6], @x[7] # 75 6 [0]
539 pshufd \$0x93, @x[6], @t[6]
540 movdqa @x[4], @t[4]
541 pxor @x[4], @x[6] # 6 4
542 pxor @x[3], @x[4] # 4 375 [6]
543 pxor @x[7], @x[3] # 375 756=36
544 pxor @t[5], @x[6] # 64 5 [7]
545 pxor @t[2], @x[3] # 36 2
546 pxor @t[4], @x[3] # 362 4 [5]
547 pshufd \$0x93, @t[5], @t[5]
549 my @y = @x[7,5,0,2,1,3,4,6];
550 $code.=<<___;
551 # multiplication by 0x0b
552 pxor @y[0], @y[1]
553 pxor @t[0], @y[0]
554 pxor @t[1], @y[1]
555 pshufd \$0x93, @t[2], @t[2]
556 pxor @t[5], @y[0]
557 pxor @t[6], @y[1]
558 pxor @t[7], @y[0]
559 pshufd \$0x93, @t[4], @t[4]
560 pxor @t[6], @t[7] # clobber t[7]
561 pxor @y[0], @y[1]
563 pxor @t[0], @y[3]
564 pshufd \$0x93, @t[0], @t[0]
565 pxor @t[1], @y[2]
566 pxor @t[1], @y[4]
567 pxor @t[2], @y[2]
568 pshufd \$0x93, @t[1], @t[1]
569 pxor @t[2], @y[3]
570 pxor @t[2], @y[5]
571 pxor @t[7], @y[2]
572 pshufd \$0x93, @t[2], @t[2]
573 pxor @t[3], @y[3]
574 pxor @t[3], @y[6]
575 pxor @t[3], @y[4]
576 pshufd \$0x93, @t[3], @t[3]
577 pxor @t[4], @y[7]
578 pxor @t[4], @y[5]
579 pxor @t[7], @y[7]
580 pxor @t[5], @y[3]
581 pxor @t[4], @y[4]
582 pxor @t[5], @t[7] # clobber t[7] even more
584 pxor @t[7], @y[5]
585 pshufd \$0x93, @t[4], @t[4]
586 pxor @t[7], @y[6]
587 pxor @t[7], @y[4]
589 pxor @t[5], @t[7]
590 pshufd \$0x93, @t[5], @t[5]
591 pxor @t[6], @t[7] # restore t[7]
593 # multiplication by 0x0d
594 pxor @y[7], @y[4]
595 pxor @t[4], @y[7]
596 pshufd \$0x93, @t[6], @t[6]
597 pxor @t[0], @y[2]
598 pxor @t[5], @y[7]
599 pxor @t[2], @y[2]
600 pshufd \$0x93, @t[7], @t[7]
602 pxor @y[1], @y[3]
603 pxor @t[1], @y[1]
604 pxor @t[0], @y[0]
605 pxor @t[0], @y[3]
606 pxor @t[5], @y[1]
607 pxor @t[5], @y[0]
608 pxor @t[7], @y[1]
609 pshufd \$0x93, @t[0], @t[0]
610 pxor @t[6], @y[0]
611 pxor @y[1], @y[3]
612 pxor @t[1], @y[4]
613 pshufd \$0x93, @t[1], @t[1]
615 pxor @t[7], @y[7]
616 pxor @t[2], @y[4]
617 pxor @t[2], @y[5]
618 pshufd \$0x93, @t[2], @t[2]
619 pxor @t[6], @y[2]
620 pxor @t[3], @t[6] # clobber t[6]
621 pxor @y[7], @y[4]
622 pxor @t[6], @y[3]
624 pxor @t[6], @y[6]
625 pxor @t[5], @y[5]
626 pxor @t[4], @y[6]
627 pshufd \$0x93, @t[4], @t[4]
628 pxor @t[6], @y[5]
629 pxor @t[7], @y[6]
630 pxor @t[3], @t[6] # restore t[6]
632 pshufd \$0x93, @t[5], @t[5]
633 pshufd \$0x93, @t[6], @t[6]
634 pshufd \$0x93, @t[7], @t[7]
635 pshufd \$0x93, @t[3], @t[3]
637 # multiplication by 0x09
638 pxor @y[1], @y[4]
639 pxor @y[1], @t[1] # t[1]=y[1]
640 pxor @t[5], @t[0] # clobber t[0]
641 pxor @t[5], @t[1]
642 pxor @t[0], @y[3]
643 pxor @y[0], @t[0] # t[0]=y[0]
644 pxor @t[6], @t[1]
645 pxor @t[7], @t[6] # clobber t[6]
646 pxor @t[1], @y[4]
647 pxor @t[4], @y[7]
648 pxor @y[4], @t[4] # t[4]=y[4]
649 pxor @t[3], @y[6]
650 pxor @y[3], @t[3] # t[3]=y[3]
651 pxor @t[2], @y[5]
652 pxor @y[2], @t[2] # t[2]=y[2]
653 pxor @t[7], @t[3]
654 pxor @y[5], @t[5] # t[5]=y[5]
655 pxor @t[6], @t[2]
656 pxor @t[6], @t[5]
657 pxor @y[6], @t[6] # t[6]=y[6]
658 pxor @y[7], @t[7] # t[7]=y[7]
660 movdqa @t[0],@XMM[0]
661 movdqa @t[1],@XMM[1]
662 movdqa @t[2],@XMM[2]
663 movdqa @t[3],@XMM[3]
664 movdqa @t[4],@XMM[4]
665 movdqa @t[5],@XMM[5]
666 movdqa @t[6],@XMM[6]
667 movdqa @t[7],@XMM[7]
# InvMixColumns(@x[0..7], @t[0..7])
# Two-step form of the inverse transform, following the matrix factorisation
# shown in the comment below: first emits the multiplication by the
# 05-00-04-00 matrix, then calls MixColumns with its $inv flag set.
# Results are left in @x; all eight @t registers are clobbered.
671 sub InvMixColumns {
672 my @x=@_[0..7];
673 my @t=@_[8..15];
675 # Thanks to Jussi Kivilinna for providing pointer to
677 # | 0e 0b 0d 09 | | 02 03 01 01 | | 05 00 04 00 |
678 # | 09 0e 0b 0d | = | 01 02 03 01 | x | 00 05 00 04 |
679 # | 0d 09 0e 0b | | 01 01 02 03 | | 04 00 05 00 |
680 # | 0b 0d 09 0e | | 03 01 01 02 | | 00 04 00 05 |
682 $code.=<<___;
683 # multiplication by 0x05-0x00-0x04-0x00
684 pshufd \$0x4E, @x[0], @t[0]
685 pshufd \$0x4E, @x[6], @t[6]
686 pxor @x[0], @t[0]
687 pshufd \$0x4E, @x[7], @t[7]
688 pxor @x[6], @t[6]
689 pshufd \$0x4E, @x[1], @t[1]
690 pxor @x[7], @t[7]
691 pshufd \$0x4E, @x[2], @t[2]
692 pxor @x[1], @t[1]
693 pshufd \$0x4E, @x[3], @t[3]
694 pxor @x[2], @t[2]
695 pxor @t[6], @x[0]
696 pxor @t[6], @x[1]
697 pshufd \$0x4E, @x[4], @t[4]
698 pxor @x[3], @t[3]
699 pxor @t[0], @x[2]
700 pxor @t[1], @x[3]
701 pshufd \$0x4E, @x[5], @t[5]
702 pxor @x[4], @t[4]
703 pxor @t[7], @x[1]
704 pxor @t[2], @x[4]
705 pxor @x[5], @t[5]
707 pxor @t[7], @x[2]
708 pxor @t[6], @x[3]
709 pxor @t[6], @x[4]
710 pxor @t[3], @x[5]
711 pxor @t[4], @x[6]
712 pxor @t[7], @x[4]
713 pxor @t[7], @x[5]
714 pxor @t[5], @x[7]
716 &MixColumns (@x,@t,1); # flipped 2<->3 and 4<->6
# aesenc(@b[0..7], @t[0..7])
# Loads the shuffle mask at 0x30($const) into @t[0], then emits ShiftRows,
# Sbox and MixColumns in that order. MixColumns is given @b in the order
# [0,1,4,6,3,7,2,5], which is the output order Sbox documents.
# Marked as unused by its own trailing comment.
719 sub aesenc { # not used
720 my @b=@_[0..7];
721 my @t=@_[8..15];
722 $code.=<<___;
723 movdqa 0x30($const),@t[0] # .LSR
725 &ShiftRows (@b,@t[0]);
726 &Sbox (@b,@t);
727 &MixColumns (@b[0,1,4,6,3,7,2,5],@t);
# aesenclast(@b[0..7], @t[0..7])
# Loads the shuffle mask at 0x40($const) into @t[0], emits ShiftRows and Sbox,
# then XORs the eight 16-byte values at 0x00..0x70($key) into @b in the order
# [0,1,4,6,3,7,2,5]. No MixColumns step is emitted.
# Marked as unused by its own trailing comment.
730 sub aesenclast { # not used
731 my @b=@_[0..7];
732 my @t=@_[8..15];
733 $code.=<<___;
734 movdqa 0x40($const),@t[0] # .LSRM0
736 &ShiftRows (@b,@t[0]);
737 &Sbox (@b,@t);
738 $code.=<<___
739 pxor 0x00($key),@b[0]
740 pxor 0x10($key),@b[1]
741 pxor 0x20($key),@b[4]
742 pxor 0x30($key),@b[6]
743 pxor 0x40($key),@b[3]
744 pxor 0x50($key),@b[7]
745 pxor 0x60($key),@b[2]
746 pxor 0x70($key),@b[5]
# swapmove($a, $b, $n, $mask, $t)
# Emits code that exchanges bits between two registers: with
# d = (($b >> $n) ^ $a) & $mask, it leaves $a ^= d and $b ^= (d << $n).
# The shifts are 64-bit lane shifts (psrlq/psllq). $t holds a copy of the
# original $b and is clobbered; $mask is only read.
750 sub swapmove {
751 my ($a,$b,$n,$mask,$t)=@_;
752 $code.=<<___;
753 movdqa $b,$t
754 psrlq \$$n,$b
755 pxor $a,$b
756 pand $mask,$b
757 pxor $b,$a
758 psllq \$$n,$b
759 pxor $t,$b
# swapmove2x($a0, $b0, $a1, $b1, $n, $mask, $t0, $t1)
# Two swapmove operations with the same shift $n and the same $mask, emitted
# interleaved: one on the pair ($a0,$b0) using $t0, the other on ($a1,$b1)
# using $t1. Both temporaries are clobbered; $mask is only read.
762 sub swapmove2x {
763 my ($a0,$b0,$a1,$b1,$n,$mask,$t0,$t1)=@_;
764 $code.=<<___;
765 movdqa $b0,$t0
766 psrlq \$$n,$b0
767 movdqa $b1,$t1
768 psrlq \$$n,$b1
769 pxor $a0,$b0
770 pxor $a1,$b1
771 pand $mask,$b0
772 pand $mask,$b1
773 pxor $b0,$a0
774 psllq \$$n,$b0
775 pxor $b1,$a1
776 psllq \$$n,$b1
777 pxor $t0,$b0
778 pxor $t1,$b1
# bitslice(@x[0..7], $t0, $t1, $t2, $t3)
# The first eight arguments are reversed before use. Emits three passes of
# swapmove2x over them with shifts 1, 2 and 4. The masks are loaded from
# 0x00, 0x10 and 0x20($const) (labelled .LBS0, .LBS1, .LBS2); $t0 is reloaded
# with the third mask once the first pass no longer needs it.
# $t2 and $t3 are the swapmove temporaries; all four $t registers are
# clobbered.
782 sub bitslice {
783 my @x=reverse(@_[0..7]);
784 my ($t0,$t1,$t2,$t3)=@_[8..11];
785 $code.=<<___;
786 movdqa 0x00($const),$t0 # .LBS0
787 movdqa 0x10($const),$t1 # .LBS1
789 &swapmove2x(@x[0,1,2,3],1,$t0,$t2,$t3);
790 &swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
791 $code.=<<___;
792 movdqa 0x20($const),$t0 # .LBS2
794 &swapmove2x(@x[0,2,1,3],2,$t1,$t2,$t3);
795 &swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
797 &swapmove2x(@x[0,4,1,5],4,$t0,$t2,$t3);
798 &swapmove2x(@x[2,6,3,7],4,$t0,$t2,$t3);
# Emits the .text header, the two .extern declarations, and the two internal
# eight-block routines _bsaes_encrypt8 and _bsaes_decrypt8.
#
# Both routines take their eight input blocks in @XMM[0..7], the key schedule
# pointer in $key and the round count in $rounds. Each one:
#   - XORs the 16 bytes at ($key) into every block and shuffles with a mask
#     taken from the constants table at $const;
#   - bit-slices the eight registers;
#   - enters the round loop at the S-box label, so the first pass skips
#     ShiftRows; the loop exits once $rounds is decremented below zero;
#   - on the pass where $rounds reaches zero, loads a different shuffle mask
#     (.LSRM0 / .LISRM0) for the final ShiftRows;
#   - undoes the bit-slicing and XORs the 16 bytes at ($key) into each block.
# Results are left in @XMM[0,1,4,6,3,7,2,5] for encryption and in
# @XMM[0,1,6,4,2,7,3,5] for decryption; callers store them in that order.
# NOTE(review): no return instruction is visible between the last pxor and
# the .size directive of either routine; original lines 868 and 930 are absent
# from this copy - confirm against the pristine file.
801 $code.=<<___;
802 .text
804 .extern asm_AES_encrypt
805 .extern asm_AES_decrypt
807 .type _bsaes_encrypt8,\@abi-omnipotent
808 .align 64
809 _bsaes_encrypt8:
810 lea .LBS0(%rip), $const # constants table
812 movdqa ($key), @XMM[9] # round 0 key
813 lea 0x10($key), $key
814 movdqa 0x50($const), @XMM[8] # .LM0SR
815 pxor @XMM[9], @XMM[0] # xor with round0 key
816 pxor @XMM[9], @XMM[1]
817 pxor @XMM[9], @XMM[2]
818 pxor @XMM[9], @XMM[3]
819 pshufb @XMM[8], @XMM[0]
820 pshufb @XMM[8], @XMM[1]
821 pxor @XMM[9], @XMM[4]
822 pxor @XMM[9], @XMM[5]
823 pshufb @XMM[8], @XMM[2]
824 pshufb @XMM[8], @XMM[3]
825 pxor @XMM[9], @XMM[6]
826 pxor @XMM[9], @XMM[7]
827 pshufb @XMM[8], @XMM[4]
828 pshufb @XMM[8], @XMM[5]
829 pshufb @XMM[8], @XMM[6]
830 pshufb @XMM[8], @XMM[7]
831 _bsaes_encrypt8_bitslice:
833 &bitslice (@XMM[0..7, 8..11]);
834 $code.=<<___;
835 dec $rounds
836 jmp .Lenc_sbox
837 .align 16
838 .Lenc_loop:
840 &ShiftRows (@XMM[0..7, 8]);
841 $code.=".Lenc_sbox:\n";
842 &Sbox (@XMM[0..7, 8..15]);
843 $code.=<<___;
844 dec $rounds
845 jl .Lenc_done
847 &MixColumns (@XMM[0,1,4,6,3,7,2,5, 8..15]);
848 $code.=<<___;
849 movdqa 0x30($const), @XMM[8] # .LSR
850 jnz .Lenc_loop
851 movdqa 0x40($const), @XMM[8] # .LSRM0
852 jmp .Lenc_loop
853 .align 16
854 .Lenc_done:
856 # output in lsb > [t0, t1, t4, t6, t3, t7, t2, t5] < msb
857 &bitslice (@XMM[0,1,4,6,3,7,2,5, 8..11]);
858 $code.=<<___;
859 movdqa ($key), @XMM[8] # last round key
860 pxor @XMM[8], @XMM[4]
861 pxor @XMM[8], @XMM[6]
862 pxor @XMM[8], @XMM[3]
863 pxor @XMM[8], @XMM[7]
864 pxor @XMM[8], @XMM[2]
865 pxor @XMM[8], @XMM[5]
866 pxor @XMM[8], @XMM[0]
867 pxor @XMM[8], @XMM[1]
869 .size _bsaes_encrypt8,.-_bsaes_encrypt8
871 .type _bsaes_decrypt8,\@abi-omnipotent
872 .align 64
873 _bsaes_decrypt8:
874 lea .LBS0(%rip), $const # constants table
876 movdqa ($key), @XMM[9] # round 0 key
877 lea 0x10($key), $key
878 movdqa -0x30($const), @XMM[8] # .LM0ISR
879 pxor @XMM[9], @XMM[0] # xor with round0 key
880 pxor @XMM[9], @XMM[1]
881 pxor @XMM[9], @XMM[2]
882 pxor @XMM[9], @XMM[3]
883 pshufb @XMM[8], @XMM[0]
884 pshufb @XMM[8], @XMM[1]
885 pxor @XMM[9], @XMM[4]
886 pxor @XMM[9], @XMM[5]
887 pshufb @XMM[8], @XMM[2]
888 pshufb @XMM[8], @XMM[3]
889 pxor @XMM[9], @XMM[6]
890 pxor @XMM[9], @XMM[7]
891 pshufb @XMM[8], @XMM[4]
892 pshufb @XMM[8], @XMM[5]
893 pshufb @XMM[8], @XMM[6]
894 pshufb @XMM[8], @XMM[7]
896 &bitslice (@XMM[0..7, 8..11]);
897 $code.=<<___;
898 dec $rounds
899 jmp .Ldec_sbox
900 .align 16
901 .Ldec_loop:
903 &ShiftRows (@XMM[0..7, 8]);
904 $code.=".Ldec_sbox:\n";
905 &InvSbox (@XMM[0..7, 8..15]);
906 $code.=<<___;
907 dec $rounds
908 jl .Ldec_done
910 &InvMixColumns (@XMM[0,1,6,4,2,7,3,5, 8..15]);
911 $code.=<<___;
912 movdqa -0x10($const), @XMM[8] # .LISR
913 jnz .Ldec_loop
914 movdqa -0x20($const), @XMM[8] # .LISRM0
915 jmp .Ldec_loop
916 .align 16
917 .Ldec_done:
919 &bitslice (@XMM[0,1,6,4,2,7,3,5, 8..11]);
920 $code.=<<___;
921 movdqa ($key), @XMM[8] # last round key
922 pxor @XMM[8], @XMM[6]
923 pxor @XMM[8], @XMM[4]
924 pxor @XMM[8], @XMM[2]
925 pxor @XMM[8], @XMM[7]
926 pxor @XMM[8], @XMM[3]
927 pxor @XMM[8], @XMM[5]
928 pxor @XMM[8], @XMM[0]
929 pxor @XMM[8], @XMM[1]
931 .size _bsaes_decrypt8,.-_bsaes_decrypt8
# Register names for the key-conversion code that follows: $out is %rax and
# $inp is %rcx here, which differs from the bindings used earlier in the file.
# NOTE(review): original lines 932-934 are absent from this copy, so the scope
# in which these re-declarations take effect cannot be confirmed from here.
935 my ($out,$inp,$rounds,$const)=("%rax","%rcx","%r10d","%r11");
# bitslice_key(@x[0..7], $bs0, $bs1, $bs2, $t2, $t3)
# Variant of bitslice in which the three masks are passed in by the caller
# instead of being loaded from $const. Several of the swapmove steps are
# replaced by plain register copies; the commented-out calls in the emitted
# text show which ones.
# The first call passes six arguments to swapmove, which reads only five, so
# the trailing $t3 is ignored there.
# No call to this sub is visible in this part of the file.
937 sub bitslice_key {
938 my @x=reverse(@_[0..7]);
939 my ($bs0,$bs1,$bs2,$t2,$t3)=@_[8..12];
941 &swapmove (@x[0,1],1,$bs0,$t2,$t3);
942 $code.=<<___;
943 #&swapmove(@x[2,3],1,$t0,$t2,$t3);
944 movdqa @x[0], @x[2]
945 movdqa @x[1], @x[3]
947 #&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
949 &swapmove2x (@x[0,2,1,3],2,$bs1,$t2,$t3);
950 $code.=<<___;
951 #&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
952 movdqa @x[0], @x[4]
953 movdqa @x[2], @x[6]
954 movdqa @x[1], @x[5]
955 movdqa @x[3], @x[7]
957 &swapmove2x (@x[0,4,1,5],4,$bs2,$t2,$t3);
958 &swapmove2x (@x[2,6,3,7],4,$bs2,$t2,$t3);
# Emits _bsaes_key_convert, which turns a conventional key schedule into the
# bit-sliced layout read by ShiftRows.
#   in:  $inp    - source key schedule (read with unaligned loads)
#        $out    - destination buffer (written with aligned stores)
#        $rounds - round count
#   out: the round-0 key is copied unchanged to the first 16 bytes of $out;
#        each loop pass then writes 0x80 bytes for one round key. $rounds is
#        decremented once before the loop, so the loop runs $rounds-1 times.
#        On exit $out points just past the last block written, %xmm6 holds the
#        last round key (loaded but deliberately not stored), and %xmm7 holds
#        the constant at 0x50($const) (.L63). Callers combine %xmm6 and %xmm7
#        themselves and store the result at ($out).
# Inside the loop each of the eight bit masks (0x01..0x80) is ANDed with the
# shuffled round key and compared for equality with the mask. Four of the
# eight results (%xmm8, %xmm9, %xmm13, %xmm14) are then inverted by XOR with
# the all-ones register %xmm5.
# NOTE(review): no return instruction is visible before the .size directive;
# original line 1042 is absent from this copy - confirm against the pristine
# file.
961 $code.=<<___;
962 .type _bsaes_key_convert,\@abi-omnipotent
963 .align 16
964 _bsaes_key_convert:
965 lea .Lmasks(%rip), $const
966 movdqu ($inp), %xmm7 # load round 0 key
967 lea 0x10($inp), $inp
968 movdqa 0x00($const), %xmm0 # 0x01...
969 movdqa 0x10($const), %xmm1 # 0x02...
970 movdqa 0x20($const), %xmm2 # 0x04...
971 movdqa 0x30($const), %xmm3 # 0x08...
972 movdqa 0x40($const), %xmm4 # .LM0
973 pcmpeqd %xmm5, %xmm5 # .LNOT
975 movdqu ($inp), %xmm6 # load round 1 key
976 movdqa %xmm7, ($out) # save round 0 key
977 lea 0x10($out), $out
978 dec $rounds
979 jmp .Lkey_loop
980 .align 16
981 .Lkey_loop:
982 pshufb %xmm4, %xmm6 # .LM0
984 movdqa %xmm0, %xmm8
985 movdqa %xmm1, %xmm9
987 pand %xmm6, %xmm8
988 pand %xmm6, %xmm9
989 movdqa %xmm2, %xmm10
990 pcmpeqb %xmm0, %xmm8
991 psllq \$4, %xmm0 # 0x10...
992 movdqa %xmm3, %xmm11
993 pcmpeqb %xmm1, %xmm9
994 psllq \$4, %xmm1 # 0x20...
996 pand %xmm6, %xmm10
997 pand %xmm6, %xmm11
998 movdqa %xmm0, %xmm12
999 pcmpeqb %xmm2, %xmm10
1000 psllq \$4, %xmm2 # 0x40...
1001 movdqa %xmm1, %xmm13
1002 pcmpeqb %xmm3, %xmm11
1003 psllq \$4, %xmm3 # 0x80...
1005 movdqa %xmm2, %xmm14
1006 movdqa %xmm3, %xmm15
1007 pxor %xmm5, %xmm8 # "pnot"
1008 pxor %xmm5, %xmm9
1010 pand %xmm6, %xmm12
1011 pand %xmm6, %xmm13
1012 movdqa %xmm8, 0x00($out) # write bit-sliced round key
1013 pcmpeqb %xmm0, %xmm12
1014 psrlq \$4, %xmm0 # 0x01...
1015 movdqa %xmm9, 0x10($out)
1016 pcmpeqb %xmm1, %xmm13
1017 psrlq \$4, %xmm1 # 0x02...
1018 lea 0x10($inp), $inp
1020 pand %xmm6, %xmm14
1021 pand %xmm6, %xmm15
1022 movdqa %xmm10, 0x20($out)
1023 pcmpeqb %xmm2, %xmm14
1024 psrlq \$4, %xmm2 # 0x04...
1025 movdqa %xmm11, 0x30($out)
1026 pcmpeqb %xmm3, %xmm15
1027 psrlq \$4, %xmm3 # 0x08...
1028 movdqu ($inp), %xmm6 # load next round key
1030 pxor %xmm5, %xmm13 # "pnot"
1031 pxor %xmm5, %xmm14
1032 movdqa %xmm12, 0x40($out)
1033 movdqa %xmm13, 0x50($out)
1034 movdqa %xmm14, 0x60($out)
1035 movdqa %xmm15, 0x70($out)
1036 lea 0x80($out),$out
1037 dec $rounds
1038 jnz .Lkey_loop
1040 movdqa 0x50($const), %xmm7 # .L63
1041 #movdqa %xmm6, ($out) # don't save last round key
1043 .size _bsaes_key_convert,.-_bsaes_key_convert
# Benchmark-only entry points. The guard condition begins with a literal 0,
# so it is never true and none of this text is appended to $code.
# The block would define four functions:
#   bsaes_enc_key_convert - calls _bsaes_key_convert, then stores
#                           %xmm6 ^ %xmm7 as the last round key;
#   bsaes_encrypt_128     - loops over 0x80-byte chunks, calling
#                           _bsaes_encrypt8 with a fixed round count of 10;
#   bsaes_dec_key_convert - calls _bsaes_key_convert, then stores %xmm6 as the
#                           last round key and rewrites the key at ($out) as
#                           its XOR with %xmm7;
#   bsaes_decrypt_128     - same loop as bsaes_encrypt_128, calling
#                           _bsaes_decrypt8.
# The two loops store results in the register order left by the respective
# 8-block routine.
# NOTE(review): which registers $inp, $out, $len and $key expand to inside
# this block depends on scoping lines that are absent from this copy.
1047 if (0 && !$win64) { # following four functions are unsupported interface
1048 # used for benchmarking...
1049 $code.=<<___;
1050 .globl bsaes_enc_key_convert
1051 .type bsaes_enc_key_convert,\@function,2
1052 .align 16
1053 bsaes_enc_key_convert:
1054 mov 240($inp),%r10d # pass rounds
1055 mov $inp,%rcx # pass key
1056 mov $out,%rax # pass key schedule
1057 call _bsaes_key_convert
1058 pxor %xmm6,%xmm7 # fix up last round key
1059 movdqa %xmm7,(%rax) # save last round key
1061 .size bsaes_enc_key_convert,.-bsaes_enc_key_convert
1063 .globl bsaes_encrypt_128
1064 .type bsaes_encrypt_128,\@function,4
1065 .align 16
1066 bsaes_encrypt_128:
1067 .Lenc128_loop:
1068 movdqu 0x00($inp), @XMM[0] # load input
1069 movdqu 0x10($inp), @XMM[1]
1070 movdqu 0x20($inp), @XMM[2]
1071 movdqu 0x30($inp), @XMM[3]
1072 movdqu 0x40($inp), @XMM[4]
1073 movdqu 0x50($inp), @XMM[5]
1074 movdqu 0x60($inp), @XMM[6]
1075 movdqu 0x70($inp), @XMM[7]
1076 mov $key, %rax # pass the $key
1077 lea 0x80($inp), $inp
1078 mov \$10,%r10d
1080 call _bsaes_encrypt8
1082 movdqu @XMM[0], 0x00($out) # write output
1083 movdqu @XMM[1], 0x10($out)
1084 movdqu @XMM[4], 0x20($out)
1085 movdqu @XMM[6], 0x30($out)
1086 movdqu @XMM[3], 0x40($out)
1087 movdqu @XMM[7], 0x50($out)
1088 movdqu @XMM[2], 0x60($out)
1089 movdqu @XMM[5], 0x70($out)
1090 lea 0x80($out), $out
1091 sub \$0x80,$len
1092 ja .Lenc128_loop
1094 .size bsaes_encrypt_128,.-bsaes_encrypt_128
1096 .globl bsaes_dec_key_convert
1097 .type bsaes_dec_key_convert,\@function,2
1098 .align 16
1099 bsaes_dec_key_convert:
1100 mov 240($inp),%r10d # pass rounds
1101 mov $inp,%rcx # pass key
1102 mov $out,%rax # pass key schedule
1103 call _bsaes_key_convert
1104 pxor ($out),%xmm7 # fix up round 0 key
1105 movdqa %xmm6,(%rax) # save last round key
1106 movdqa %xmm7,($out)
1108 .size bsaes_dec_key_convert,.-bsaes_dec_key_convert
1110 .globl bsaes_decrypt_128
1111 .type bsaes_decrypt_128,\@function,4
1112 .align 16
1113 bsaes_decrypt_128:
1114 .Ldec128_loop:
1115 movdqu 0x00($inp), @XMM[0] # load input
1116 movdqu 0x10($inp), @XMM[1]
1117 movdqu 0x20($inp), @XMM[2]
1118 movdqu 0x30($inp), @XMM[3]
1119 movdqu 0x40($inp), @XMM[4]
1120 movdqu 0x50($inp), @XMM[5]
1121 movdqu 0x60($inp), @XMM[6]
1122 movdqu 0x70($inp), @XMM[7]
1123 mov $key, %rax # pass the $key
1124 lea 0x80($inp), $inp
1125 mov \$10,%r10d
1127 call _bsaes_decrypt8
1129 movdqu @XMM[0], 0x00($out) # write output
1130 movdqu @XMM[1], 0x10($out)
1131 movdqu @XMM[6], 0x20($out)
1132 movdqu @XMM[4], 0x30($out)
1133 movdqu @XMM[2], 0x40($out)
1134 movdqu @XMM[7], 0x50($out)
1135 movdqu @XMM[3], 0x60($out)
1136 movdqu @XMM[5], 0x70($out)
1137 lea 0x80($out), $out
1138 sub \$0x80,$len
1139 ja .Ldec128_loop
1141 .size bsaes_decrypt_128,.-bsaes_decrypt_128
1145 ######################################################################
1147 # OpenSSL interface
# $arg1..$arg6 name the registers in which the public entry points receive
# their arguments; the set is chosen by $win64.
# $inp, $out, $len and $key are re-bound to %r12..%r15; the entry point below
# copies $arg1..$arg4 into them ("backup arguments") after pushing those
# registers in its prologue.
1149 my ($arg1,$arg2,$arg3,$arg4,$arg5,$arg6)=$win64 ? ("%rcx","%rdx","%r8","%r9","%r10","%r11d")
1150 : ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
1151 my ($inp,$out,$len,$key)=("%r12","%r13","%r14","%r15");
1153 if ($ecb) {
1154 $code.=<<___;
1155 .globl bsaes_ecb_encrypt_blocks
1156 .type bsaes_ecb_encrypt_blocks,\@abi-omnipotent
1157 .align 16
1158 bsaes_ecb_encrypt_blocks:
1159 mov %rsp, %rax
1160 .Lecb_enc_prologue:
1161 push %rbp
1162 push %rbx
1163 push %r12
1164 push %r13
1165 push %r14
1166 push %r15
1167 lea -0x48(%rsp),%rsp
1169 $code.=<<___ if ($win64);
1170 lea -0xa0(%rsp), %rsp
1171 movaps %xmm6, 0x40(%rsp)
1172 movaps %xmm7, 0x50(%rsp)
1173 movaps %xmm8, 0x60(%rsp)
1174 movaps %xmm9, 0x70(%rsp)
1175 movaps %xmm10, 0x80(%rsp)
1176 movaps %xmm11, 0x90(%rsp)
1177 movaps %xmm12, 0xa0(%rsp)
1178 movaps %xmm13, 0xb0(%rsp)
1179 movaps %xmm14, 0xc0(%rsp)
1180 movaps %xmm15, 0xd0(%rsp)
1181 .Lecb_enc_body:
1183 $code.=<<___;
1184 mov %rsp,%rbp # backup %rsp
1185 mov 240($arg4),%eax # rounds
1186 mov $arg1,$inp # backup arguments
1187 mov $arg2,$out
1188 mov $arg3,$len
1189 mov $arg4,$key
1190 cmp \$8,$arg3
1191 jb .Lecb_enc_short
1193 mov %eax,%ebx # backup rounds
1194 shl \$7,%rax # 128 bytes per inner round key
1195 sub \$`128-32`,%rax # size of bit-sliced key schedule
1196 sub %rax,%rsp
1197 mov %rsp,%rax # pass key schedule
1198 mov $key,%rcx # pass key
1199 mov %ebx,%r10d # pass rounds
1200 call _bsaes_key_convert
1201 pxor %xmm6,%xmm7 # fix up last round key
1202 movdqa %xmm7,(%rax) # save last round key
1204 sub \$8,$len
1205 .Lecb_enc_loop:
1206 movdqu 0x00($inp), @XMM[0] # load input
1207 movdqu 0x10($inp), @XMM[1]
1208 movdqu 0x20($inp), @XMM[2]
1209 movdqu 0x30($inp), @XMM[3]
1210 movdqu 0x40($inp), @XMM[4]
1211 movdqu 0x50($inp), @XMM[5]
1212 mov %rsp, %rax # pass key schedule
1213 movdqu 0x60($inp), @XMM[6]
1214 mov %ebx,%r10d # pass rounds
1215 movdqu 0x70($inp), @XMM[7]
1216 lea 0x80($inp), $inp
1218 call _bsaes_encrypt8
1220 movdqu @XMM[0], 0x00($out) # write output
1221 movdqu @XMM[1], 0x10($out)
1222 movdqu @XMM[4], 0x20($out)
1223 movdqu @XMM[6], 0x30($out)
1224 movdqu @XMM[3], 0x40($out)
1225 movdqu @XMM[7], 0x50($out)
1226 movdqu @XMM[2], 0x60($out)
1227 movdqu @XMM[5], 0x70($out)
1228 lea 0x80($out), $out
1229 sub \$8,$len
1230 jnc .Lecb_enc_loop
1232 add \$8,$len
1233 jz .Lecb_enc_done
1235 movdqu 0x00($inp), @XMM[0] # load input
1236 mov %rsp, %rax # pass key schedule
1237 mov %ebx,%r10d # pass rounds
1238 cmp \$2,$len
1239 jb .Lecb_enc_one
1240 movdqu 0x10($inp), @XMM[1]
1241 je .Lecb_enc_two
1242 movdqu 0x20($inp), @XMM[2]
1243 cmp \$4,$len
1244 jb .Lecb_enc_three
1245 movdqu 0x30($inp), @XMM[3]
1246 je .Lecb_enc_four
1247 movdqu 0x40($inp), @XMM[4]
1248 cmp \$6,$len
1249 jb .Lecb_enc_five
1250 movdqu 0x50($inp), @XMM[5]
1251 je .Lecb_enc_six
1252 movdqu 0x60($inp), @XMM[6]
1253 call _bsaes_encrypt8
1254 movdqu @XMM[0], 0x00($out) # write output
1255 movdqu @XMM[1], 0x10($out)
1256 movdqu @XMM[4], 0x20($out)
1257 movdqu @XMM[6], 0x30($out)
1258 movdqu @XMM[3], 0x40($out)
1259 movdqu @XMM[7], 0x50($out)
1260 movdqu @XMM[2], 0x60($out)
1261 jmp .Lecb_enc_done
1262 .align 16
1263 .Lecb_enc_six:
1264 call _bsaes_encrypt8
1265 movdqu @XMM[0], 0x00($out) # write output
1266 movdqu @XMM[1], 0x10($out)
1267 movdqu @XMM[4], 0x20($out)
1268 movdqu @XMM[6], 0x30($out)
1269 movdqu @XMM[3], 0x40($out)
1270 movdqu @XMM[7], 0x50($out)
1271 jmp .Lecb_enc_done
1272 .align 16
1273 .Lecb_enc_five:
1274 call _bsaes_encrypt8
1275 movdqu @XMM[0], 0x00($out) # write output
1276 movdqu @XMM[1], 0x10($out)
1277 movdqu @XMM[4], 0x20($out)
1278 movdqu @XMM[6], 0x30($out)
1279 movdqu @XMM[3], 0x40($out)
1280 jmp .Lecb_enc_done
1281 .align 16
1282 .Lecb_enc_four:
1283 call _bsaes_encrypt8
1284 movdqu @XMM[0], 0x00($out) # write output
1285 movdqu @XMM[1], 0x10($out)
1286 movdqu @XMM[4], 0x20($out)
1287 movdqu @XMM[6], 0x30($out)
1288 jmp .Lecb_enc_done
1289 .align 16
1290 .Lecb_enc_three:
1291 call _bsaes_encrypt8
1292 movdqu @XMM[0], 0x00($out) # write output
1293 movdqu @XMM[1], 0x10($out)
1294 movdqu @XMM[4], 0x20($out)
1295 jmp .Lecb_enc_done
1296 .align 16
1297 .Lecb_enc_two:
1298 call _bsaes_encrypt8
1299 movdqu @XMM[0], 0x00($out) # write output
1300 movdqu @XMM[1], 0x10($out)
1301 jmp .Lecb_enc_done
1302 .align 16
1303 .Lecb_enc_one:
1304 call _bsaes_encrypt8
1305 movdqu @XMM[0], 0x00($out) # write output
1306 jmp .Lecb_enc_done
1307 .align 16
1308 .Lecb_enc_short:
1309 lea ($inp), $arg1
1310 lea ($out), $arg2
1311 lea ($key), $arg3
1312 call asm_AES_encrypt
1313 lea 16($inp), $inp
1314 lea 16($out), $out
1315 dec $len
1316 jnz .Lecb_enc_short
1318 .Lecb_enc_done:
1319 lea (%rsp),%rax
1320 pxor %xmm0, %xmm0
1321 .Lecb_enc_bzero: # wipe key schedule [if any]
1322 movdqa %xmm0, 0x00(%rax)
1323 movdqa %xmm0, 0x10(%rax)
1324 lea 0x20(%rax), %rax
1325 cmp %rax, %rbp
1326 jb .Lecb_enc_bzero
1328 lea (%rbp),%rsp # restore %rsp
1330 $code.=<<___ if ($win64);
1331 movaps 0x40(%rbp), %xmm6
1332 movaps 0x50(%rbp), %xmm7
1333 movaps 0x60(%rbp), %xmm8
1334 movaps 0x70(%rbp), %xmm9
1335 movaps 0x80(%rbp), %xmm10
1336 movaps 0x90(%rbp), %xmm11
1337 movaps 0xa0(%rbp), %xmm12
1338 movaps 0xb0(%rbp), %xmm13
1339 movaps 0xc0(%rbp), %xmm14
1340 movaps 0xd0(%rbp), %xmm15
1341 lea 0xa0(%rbp), %rsp
1343 $code.=<<___;
1344 mov 0x48(%rsp), %r15
1345 mov 0x50(%rsp), %r14
1346 mov 0x58(%rsp), %r13
1347 mov 0x60(%rsp), %r12
1348 mov 0x68(%rsp), %rbx
1349 mov 0x70(%rsp), %rax
1350 lea 0x78(%rsp), %rsp
1351 mov %rax, %rbp
1352 .Lecb_enc_epilogue:
1354 .size bsaes_ecb_encrypt_blocks,.-bsaes_ecb_encrypt_blocks
# bsaes_ecb_decrypt_blocks -- ECB decryption, eight blocks per call to
# _bsaes_decrypt8.  Arguments as consumed below: arg1 = input, arg2 = output,
# arg3 = number of 16-byte blocks, arg4 = key (rounds are read at offset 240).
# Calls with fewer than 8 blocks skip the bit-sliced path entirely and run
# asm_AES_decrypt once per block.
1356 .globl bsaes_ecb_decrypt_blocks
1357 .type bsaes_ecb_decrypt_blocks,\@abi-omnipotent
1358 .align 16
1359 bsaes_ecb_decrypt_blocks:
1360 mov %rsp, %rax
1361 .Lecb_dec_prologue:
1362 push %rbp
1363 push %rbx
1364 push %r12
1365 push %r13
1366 push %r14
1367 push %r15
1368 lea -0x48(%rsp),%rsp
# Win64 only: grow the frame by 0xa0 bytes and save %xmm6-%xmm15 in it.
1370 $code.=<<___ if ($win64);
1371 lea -0xa0(%rsp), %rsp
1372 movaps %xmm6, 0x40(%rsp)
1373 movaps %xmm7, 0x50(%rsp)
1374 movaps %xmm8, 0x60(%rsp)
1375 movaps %xmm9, 0x70(%rsp)
1376 movaps %xmm10, 0x80(%rsp)
1377 movaps %xmm11, 0x90(%rsp)
1378 movaps %xmm12, 0xa0(%rsp)
1379 movaps %xmm13, 0xb0(%rsp)
1380 movaps %xmm14, 0xc0(%rsp)
1381 movaps %xmm15, 0xd0(%rsp)
1382 .Lecb_dec_body:
# Common body: %rbp remembers the frame base; everything allocated below it
# is released again (and wiped) before returning.
1384 $code.=<<___;
1385 mov %rsp,%rbp # backup %rsp
1386 mov 240($arg4),%eax # rounds
1387 mov $arg1,$inp # backup arguments
1388 mov $arg2,$out
1389 mov $arg3,$len
1390 mov $arg4,$key
1391 cmp \$8,$arg3
1392 jb .Lecb_dec_short
# 8 or more blocks: reserve rounds*128-96 bytes of stack for the bit-sliced
# key schedule and have _bsaes_key_convert fill it in.
1394 mov %eax,%ebx # backup rounds
1395 shl \$7,%rax # 128 bytes per inner round key
1396 sub \$`128-32`,%rax # size of bit-sliced key schedule
1397 sub %rax,%rsp
1398 mov %rsp,%rax # pass key schedule
1399 mov $key,%rcx # pass key
1400 mov %ebx,%r10d # pass rounds
1401 call _bsaes_key_convert
1402 pxor (%rsp),%xmm7 # fix up 0 round key
1403 movdqa %xmm6,(%rax) # save last round key
1404 movdqa %xmm7,(%rsp)
# The block counter is biased by -8 so that a borrow (carry set) after the
# subtraction at the bottom of the loop means fewer than 8 blocks remain.
1406 sub \$8,$len
1407 .Lecb_dec_loop:
1408 movdqu 0x00($inp), @XMM[0] # load input
1409 movdqu 0x10($inp), @XMM[1]
1410 movdqu 0x20($inp), @XMM[2]
1411 movdqu 0x30($inp), @XMM[3]
1412 movdqu 0x40($inp), @XMM[4]
1413 movdqu 0x50($inp), @XMM[5]
1414 mov %rsp, %rax # pass key schedule
1415 movdqu 0x60($inp), @XMM[6]
1416 mov %ebx,%r10d # pass rounds
1417 movdqu 0x70($inp), @XMM[7]
1418 lea 0x80($inp), $inp
1420 call _bsaes_decrypt8
# Results come back permuted: blocks 0..7 sit in registers 0,1,6,4,2,7,3,5.
1422 movdqu @XMM[0], 0x00($out) # write output
1423 movdqu @XMM[1], 0x10($out)
1424 movdqu @XMM[6], 0x20($out)
1425 movdqu @XMM[4], 0x30($out)
1426 movdqu @XMM[2], 0x40($out)
1427 movdqu @XMM[7], 0x50($out)
1428 movdqu @XMM[3], 0x60($out)
1429 movdqu @XMM[5], 0x70($out)
1430 lea 0x80($out), $out
1431 sub \$8,$len
1432 jnc .Lecb_dec_loop
# Undo the bias; zero means the input was a multiple of 8 blocks.
1434 add \$8,$len
1435 jz .Lecb_dec_done
# 1..7 blocks remain.  Load only as many blocks as exist, then branch to the
# matching store sequence.  The movdqu loads do not modify the flags, so each
# cmp feeds both the jb right after it and the je one load later.
1437 movdqu 0x00($inp), @XMM[0] # load input
1438 mov %rsp, %rax # pass key schedule
1439 mov %ebx,%r10d # pass rounds
1440 cmp \$2,$len
1441 jb .Lecb_dec_one
1442 movdqu 0x10($inp), @XMM[1]
1443 je .Lecb_dec_two
1444 movdqu 0x20($inp), @XMM[2]
1445 cmp \$4,$len
1446 jb .Lecb_dec_three
1447 movdqu 0x30($inp), @XMM[3]
1448 je .Lecb_dec_four
1449 movdqu 0x40($inp), @XMM[4]
1450 cmp \$6,$len
1451 jb .Lecb_dec_five
1452 movdqu 0x50($inp), @XMM[5]
1453 je .Lecb_dec_six
# seven blocks
1454 movdqu 0x60($inp), @XMM[6]
1455 call _bsaes_decrypt8
1456 movdqu @XMM[0], 0x00($out) # write output
1457 movdqu @XMM[1], 0x10($out)
1458 movdqu @XMM[6], 0x20($out)
1459 movdqu @XMM[4], 0x30($out)
1460 movdqu @XMM[2], 0x40($out)
1461 movdqu @XMM[7], 0x50($out)
1462 movdqu @XMM[3], 0x60($out)
1463 jmp .Lecb_dec_done
1464 .align 16
1465 .Lecb_dec_six:
1466 call _bsaes_decrypt8
1467 movdqu @XMM[0], 0x00($out) # write output
1468 movdqu @XMM[1], 0x10($out)
1469 movdqu @XMM[6], 0x20($out)
1470 movdqu @XMM[4], 0x30($out)
1471 movdqu @XMM[2], 0x40($out)
1472 movdqu @XMM[7], 0x50($out)
1473 jmp .Lecb_dec_done
1474 .align 16
1475 .Lecb_dec_five:
1476 call _bsaes_decrypt8
1477 movdqu @XMM[0], 0x00($out) # write output
1478 movdqu @XMM[1], 0x10($out)
1479 movdqu @XMM[6], 0x20($out)
1480 movdqu @XMM[4], 0x30($out)
1481 movdqu @XMM[2], 0x40($out)
1482 jmp .Lecb_dec_done
1483 .align 16
1484 .Lecb_dec_four:
1485 call _bsaes_decrypt8
1486 movdqu @XMM[0], 0x00($out) # write output
1487 movdqu @XMM[1], 0x10($out)
1488 movdqu @XMM[6], 0x20($out)
1489 movdqu @XMM[4], 0x30($out)
1490 jmp .Lecb_dec_done
1491 .align 16
1492 .Lecb_dec_three:
1493 call _bsaes_decrypt8
1494 movdqu @XMM[0], 0x00($out) # write output
1495 movdqu @XMM[1], 0x10($out)
1496 movdqu @XMM[6], 0x20($out)
1497 jmp .Lecb_dec_done
1498 .align 16
1499 .Lecb_dec_two:
1500 call _bsaes_decrypt8
1501 movdqu @XMM[0], 0x00($out) # write output
1502 movdqu @XMM[1], 0x10($out)
1503 jmp .Lecb_dec_done
1504 .align 16
1505 .Lecb_dec_one:
1506 call _bsaes_decrypt8
1507 movdqu @XMM[0], 0x00($out) # write output
1508 jmp .Lecb_dec_done
1509 .align 16
# Fewer than 8 blocks in total: no key schedule was allocated (%rsp still
# equals %rbp); decrypt one block per iteration with asm_AES_decrypt.
1510 .Lecb_dec_short:
1511 lea ($inp), $arg1
1512 lea ($out), $arg2
1513 lea ($key), $arg3
1514 call asm_AES_decrypt
1515 lea 16($inp), $inp
1516 lea 16($out), $out
1517 dec $len
1518 jnz .Lecb_dec_short
1520 .Lecb_dec_done:
1521 lea (%rsp),%rax
1522 pxor %xmm0, %xmm0
1523 .Lecb_dec_bzero: # wipe key schedule [if any]
1524 movdqa %xmm0, 0x00(%rax)
1525 movdqa %xmm0, 0x10(%rax)
1526 lea 0x20(%rax), %rax
1527 cmp %rax, %rbp
# cmp sets the flags for %rbp-%rax, so "ja" repeats while %rax is still
# below the frame base and the whole region from %rsp up to %rbp is cleared.
# The previous "jb" had the sense inverted: with a key schedule present it
# stopped after the first 32 bytes, and on the short path (%rsp == %rbp) it
# was taken on every pass and kept zeroing upwards through the saved
# registers and return address.
1528 ja .Lecb_dec_bzero
1530 lea (%rbp),%rsp # restore %rsp
# Win64 only: reload %xmm6-%xmm15 and drop the extra 0xa0 bytes.
1532 $code.=<<___ if ($win64);
1533 movaps 0x40(%rbp), %xmm6
1534 movaps 0x50(%rbp), %xmm7
1535 movaps 0x60(%rbp), %xmm8
1536 movaps 0x70(%rbp), %xmm9
1537 movaps 0x80(%rbp), %xmm10
1538 movaps 0x90(%rbp), %xmm11
1539 movaps 0xa0(%rbp), %xmm12
1540 movaps 0xb0(%rbp), %xmm13
1541 movaps 0xc0(%rbp), %xmm14
1542 movaps 0xd0(%rbp), %xmm15
1543 lea 0xa0(%rbp), %rsp
# Reload the six registers pushed in the prologue (they sit above the 0x48
# byte scratch area) and release the frame; %rbp goes through %rax because
# %rsp is adjusted before %rbp is overwritten.
1545 $code.=<<___;
1546 mov 0x48(%rsp), %r15
1547 mov 0x50(%rsp), %r14
1548 mov 0x58(%rsp), %r13
1549 mov 0x60(%rsp), %r12
1550 mov 0x68(%rsp), %rbx
1551 mov 0x70(%rsp), %rax
1552 lea 0x78(%rsp), %rsp
1553 mov %rax, %rbp
1554 .Lecb_dec_epilogue:
# NOTE(review): no ret is visible between this label and .size in this view
# -- confirm against the full file.
1556 .size bsaes_ecb_decrypt_blocks,.-bsaes_ecb_decrypt_blocks
# bsaes_cbc_encrypt -- despite the name, only CBC *decryption* is done here.
# Arguments as consumed below: arg1 = input, arg2 = output, arg3 = length in
# bytes, arg4 = key (rounds at offset 240), arg5 = IV pointer (read on entry,
# updated on exit), arg6 = direction flag.  A non-zero flag or a length below
# 128 bytes is handed over to asm_AES_cbc_encrypt.
1559 $code.=<<___;
1560 .extern asm_AES_cbc_encrypt
1561 .globl bsaes_cbc_encrypt
1562 .type bsaes_cbc_encrypt,\@abi-omnipotent
1563 .align 16
1564 bsaes_cbc_encrypt:
# Win64 only: the direction flag is fetched from the stack.
1566 $code.=<<___ if ($win64);
1567 mov 48(%rsp),$arg6 # pull direction flag
# Both hand-offs are plain jumps taken before anything is pushed, so
# asm_AES_cbc_encrypt sees the caller's frame and returns directly to it.
1569 $code.=<<___;
1570 cmp \$0,$arg6
1571 jne asm_AES_cbc_encrypt
1572 cmp \$128,$arg3
1573 jb asm_AES_cbc_encrypt
1575 mov %rsp, %rax
1576 .Lcbc_dec_prologue:
1577 push %rbp
1578 push %rbx
1579 push %r12
1580 push %r13
1581 push %r14
1582 push %r15
1583 lea -0x48(%rsp), %rsp
# Win64 only: fetch the IV pointer from the stack, then grow the frame by
# 0xa0 bytes and save %xmm6-%xmm15 in it.
1585 $code.=<<___ if ($win64);
1586 mov 0xa0(%rsp),$arg5 # pull ivp
1587 lea -0xa0(%rsp), %rsp
1588 movaps %xmm6, 0x40(%rsp)
1589 movaps %xmm7, 0x50(%rsp)
1590 movaps %xmm8, 0x60(%rsp)
1591 movaps %xmm9, 0x70(%rsp)
1592 movaps %xmm10, 0x80(%rsp)
1593 movaps %xmm11, 0x90(%rsp)
1594 movaps %xmm12, 0xa0(%rsp)
1595 movaps %xmm13, 0xb0(%rsp)
1596 movaps %xmm14, 0xc0(%rsp)
1597 movaps %xmm15, 0xd0(%rsp)
1598 .Lcbc_dec_body:
# Common body.  %rbx is taken by the IV pointer in this routine, so the
# round count is kept in %edx instead.
1600 $code.=<<___;
1601 mov %rsp, %rbp # backup %rsp
1602 mov 240($arg4), %eax # rounds
1603 mov $arg1, $inp # backup arguments
1604 mov $arg2, $out
1605 mov $arg3, $len
1606 mov $arg4, $key
1607 mov $arg5, %rbx
1608 shr \$4, $len # bytes to blocks
1610 mov %eax, %edx # rounds
1611 shl \$7, %rax # 128 bytes per inner round key
1612 sub \$`128-32`, %rax # size of bit-sliced key schedule
1613 sub %rax, %rsp
1615 mov %rsp, %rax # pass key schedule
1616 mov $key, %rcx # pass key
1617 mov %edx, %r10d # pass rounds
1618 call _bsaes_key_convert
1619 pxor (%rsp),%xmm7 # fix up 0 round key
1620 movdqa %xmm6,(%rax) # save last round key
1621 movdqa %xmm7,(%rsp)
# The block counter is biased by -8: a borrow after the subtraction at the
# bottom of the loop means fewer than 8 blocks remain.
1623 movdqu (%rbx), @XMM[15] # load IV
1624 sub \$8,$len
1625 .Lcbc_dec_loop:
1626 movdqu 0x00($inp), @XMM[0] # load input
1627 movdqu 0x10($inp), @XMM[1]
1628 movdqu 0x20($inp), @XMM[2]
1629 movdqu 0x30($inp), @XMM[3]
1630 movdqu 0x40($inp), @XMM[4]
1631 movdqu 0x50($inp), @XMM[5]
1632 mov %rsp, %rax # pass key schedule
1633 movdqu 0x60($inp), @XMM[6]
1634 mov %edx,%r10d # pass rounds
1635 movdqu 0x70($inp), @XMM[7]
1636 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1638 call _bsaes_decrypt8
# Chaining step.  Decrypted blocks 0..7 come back in registers
# 0,1,6,4,2,7,3,5.  Block 0 is XORed with the saved IV and block i with
# ciphertext block i-1, which is re-read from the input before any output is
# stored.  Ciphertext block 7 becomes the IV for the next pass.
1640 pxor 0x20(%rbp), @XMM[0] # ^= IV
1641 movdqu 0x00($inp), @XMM[8] # re-load input
1642 movdqu 0x10($inp), @XMM[9]
1643 pxor @XMM[8], @XMM[1]
1644 movdqu 0x20($inp), @XMM[10]
1645 pxor @XMM[9], @XMM[6]
1646 movdqu 0x30($inp), @XMM[11]
1647 pxor @XMM[10], @XMM[4]
1648 movdqu 0x40($inp), @XMM[12]
1649 pxor @XMM[11], @XMM[2]
1650 movdqu 0x50($inp), @XMM[13]
1651 pxor @XMM[12], @XMM[7]
1652 movdqu 0x60($inp), @XMM[14]
1653 pxor @XMM[13], @XMM[3]
1654 movdqu 0x70($inp), @XMM[15] # IV
1655 pxor @XMM[14], @XMM[5]
1656 movdqu @XMM[0], 0x00($out) # write output
1657 lea 0x80($inp), $inp
1658 movdqu @XMM[1], 0x10($out)
1659 movdqu @XMM[6], 0x20($out)
1660 movdqu @XMM[4], 0x30($out)
1661 movdqu @XMM[2], 0x40($out)
1662 movdqu @XMM[7], 0x50($out)
1663 movdqu @XMM[3], 0x60($out)
1664 movdqu @XMM[5], 0x70($out)
1665 lea 0x80($out), $out
1666 sub \$8,$len
1667 jnc .Lcbc_dec_loop
# Undo the bias; zero means no partial group is left.
1669 add \$8,$len
1670 jz .Lcbc_dec_done
# 1..7 blocks remain.  Load only as many blocks as exist, then branch to the
# matching tail.  The movdqu loads do not modify the flags, so each cmp feeds
# both the jb right after it and the je one load later.
1672 movdqu 0x00($inp), @XMM[0] # load input
1673 mov %rsp, %rax # pass key schedule
1674 mov %edx, %r10d # pass rounds
1675 cmp \$2,$len
1676 jb .Lcbc_dec_one
1677 movdqu 0x10($inp), @XMM[1]
1678 je .Lcbc_dec_two
1679 movdqu 0x20($inp), @XMM[2]
1680 cmp \$4,$len
1681 jb .Lcbc_dec_three
1682 movdqu 0x30($inp), @XMM[3]
1683 je .Lcbc_dec_four
1684 movdqu 0x40($inp), @XMM[4]
1685 cmp \$6,$len
1686 jb .Lcbc_dec_five
1687 movdqu 0x50($inp), @XMM[5]
1688 je .Lcbc_dec_six
# seven blocks; every tail below follows the same pattern as the main loop,
# with the last ciphertext block of the tail loaded as the outgoing IV
1689 movdqu 0x60($inp), @XMM[6]
1690 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1691 call _bsaes_decrypt8
1692 pxor 0x20(%rbp), @XMM[0] # ^= IV
1693 movdqu 0x00($inp), @XMM[8] # re-load input
1694 movdqu 0x10($inp), @XMM[9]
1695 pxor @XMM[8], @XMM[1]
1696 movdqu 0x20($inp), @XMM[10]
1697 pxor @XMM[9], @XMM[6]
1698 movdqu 0x30($inp), @XMM[11]
1699 pxor @XMM[10], @XMM[4]
1700 movdqu 0x40($inp), @XMM[12]
1701 pxor @XMM[11], @XMM[2]
1702 movdqu 0x50($inp), @XMM[13]
1703 pxor @XMM[12], @XMM[7]
1704 movdqu 0x60($inp), @XMM[15] # IV
1705 pxor @XMM[13], @XMM[3]
1706 movdqu @XMM[0], 0x00($out) # write output
1707 movdqu @XMM[1], 0x10($out)
1708 movdqu @XMM[6], 0x20($out)
1709 movdqu @XMM[4], 0x30($out)
1710 movdqu @XMM[2], 0x40($out)
1711 movdqu @XMM[7], 0x50($out)
1712 movdqu @XMM[3], 0x60($out)
1713 jmp .Lcbc_dec_done
1714 .align 16
1715 .Lcbc_dec_six:
1716 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1717 call _bsaes_decrypt8
1718 pxor 0x20(%rbp), @XMM[0] # ^= IV
1719 movdqu 0x00($inp), @XMM[8] # re-load input
1720 movdqu 0x10($inp), @XMM[9]
1721 pxor @XMM[8], @XMM[1]
1722 movdqu 0x20($inp), @XMM[10]
1723 pxor @XMM[9], @XMM[6]
1724 movdqu 0x30($inp), @XMM[11]
1725 pxor @XMM[10], @XMM[4]
1726 movdqu 0x40($inp), @XMM[12]
1727 pxor @XMM[11], @XMM[2]
1728 movdqu 0x50($inp), @XMM[15] # IV
1729 pxor @XMM[12], @XMM[7]
1730 movdqu @XMM[0], 0x00($out) # write output
1731 movdqu @XMM[1], 0x10($out)
1732 movdqu @XMM[6], 0x20($out)
1733 movdqu @XMM[4], 0x30($out)
1734 movdqu @XMM[2], 0x40($out)
1735 movdqu @XMM[7], 0x50($out)
1736 jmp .Lcbc_dec_done
1737 .align 16
1738 .Lcbc_dec_five:
1739 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1740 call _bsaes_decrypt8
1741 pxor 0x20(%rbp), @XMM[0] # ^= IV
1742 movdqu 0x00($inp), @XMM[8] # re-load input
1743 movdqu 0x10($inp), @XMM[9]
1744 pxor @XMM[8], @XMM[1]
1745 movdqu 0x20($inp), @XMM[10]
1746 pxor @XMM[9], @XMM[6]
1747 movdqu 0x30($inp), @XMM[11]
1748 pxor @XMM[10], @XMM[4]
1749 movdqu 0x40($inp), @XMM[15] # IV
1750 pxor @XMM[11], @XMM[2]
1751 movdqu @XMM[0], 0x00($out) # write output
1752 movdqu @XMM[1], 0x10($out)
1753 movdqu @XMM[6], 0x20($out)
1754 movdqu @XMM[4], 0x30($out)
1755 movdqu @XMM[2], 0x40($out)
1756 jmp .Lcbc_dec_done
1757 .align 16
1758 .Lcbc_dec_four:
1759 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1760 call _bsaes_decrypt8
1761 pxor 0x20(%rbp), @XMM[0] # ^= IV
1762 movdqu 0x00($inp), @XMM[8] # re-load input
1763 movdqu 0x10($inp), @XMM[9]
1764 pxor @XMM[8], @XMM[1]
1765 movdqu 0x20($inp), @XMM[10]
1766 pxor @XMM[9], @XMM[6]
1767 movdqu 0x30($inp), @XMM[15] # IV
1768 pxor @XMM[10], @XMM[4]
1769 movdqu @XMM[0], 0x00($out) # write output
1770 movdqu @XMM[1], 0x10($out)
1771 movdqu @XMM[6], 0x20($out)
1772 movdqu @XMM[4], 0x30($out)
1773 jmp .Lcbc_dec_done
1774 .align 16
1775 .Lcbc_dec_three:
1776 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1777 call _bsaes_decrypt8
1778 pxor 0x20(%rbp), @XMM[0] # ^= IV
1779 movdqu 0x00($inp), @XMM[8] # re-load input
1780 movdqu 0x10($inp), @XMM[9]
1781 pxor @XMM[8], @XMM[1]
1782 movdqu 0x20($inp), @XMM[15] # IV
1783 pxor @XMM[9], @XMM[6]
1784 movdqu @XMM[0], 0x00($out) # write output
1785 movdqu @XMM[1], 0x10($out)
1786 movdqu @XMM[6], 0x20($out)
1787 jmp .Lcbc_dec_done
1788 .align 16
1789 .Lcbc_dec_two:
1790 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1791 call _bsaes_decrypt8
1792 pxor 0x20(%rbp), @XMM[0] # ^= IV
1793 movdqu 0x00($inp), @XMM[8] # re-load input
1794 movdqu 0x10($inp), @XMM[15] # IV
1795 pxor @XMM[8], @XMM[1]
1796 movdqu @XMM[0], 0x00($out) # write output
1797 movdqu @XMM[1], 0x10($out)
1798 jmp .Lcbc_dec_done
1799 .align 16
# Single block: decrypt it with asm_AES_decrypt into the scratch slot at
# 0x20(%rbp), XOR the IV register with that result and store it.  The
# ciphertext block loaded into register 0 before the dispatch is still
# intact and becomes the outgoing IV.  This tail falls through into
# .Lcbc_dec_done.
1800 .Lcbc_dec_one:
1801 lea ($inp), $arg1
1802 lea 0x20(%rbp), $arg2 # buffer output
1803 lea ($key), $arg3
1804 call asm_AES_decrypt # doesn't touch %xmm
1805 pxor 0x20(%rbp), @XMM[15] # ^= IV
1806 movdqu @XMM[15], ($out) # write output
1807 movdqa @XMM[0], @XMM[15] # IV
1809 .Lcbc_dec_done:
1810 movdqu @XMM[15], (%rbx) # return IV
1811 lea (%rsp), %rax
1812 pxor %xmm0, %xmm0
# Zero the stack from %rsp upwards, 32 bytes per pass, while %rax is still
# below %rbp (cmp sets the flags for %rbp-%rax).
1813 .Lcbc_dec_bzero: # wipe key schedule [if any]
1814 movdqa %xmm0, 0x00(%rax)
1815 movdqa %xmm0, 0x10(%rax)
1816 lea 0x20(%rax), %rax
1817 cmp %rax, %rbp
1818 ja .Lcbc_dec_bzero
1820 lea (%rbp),%rsp # restore %rsp
# Win64 only: reload %xmm6-%xmm15 and drop the extra 0xa0 bytes.
1822 $code.=<<___ if ($win64);
1823 movaps 0x40(%rbp), %xmm6
1824 movaps 0x50(%rbp), %xmm7
1825 movaps 0x60(%rbp), %xmm8
1826 movaps 0x70(%rbp), %xmm9
1827 movaps 0x80(%rbp), %xmm10
1828 movaps 0x90(%rbp), %xmm11
1829 movaps 0xa0(%rbp), %xmm12
1830 movaps 0xb0(%rbp), %xmm13
1831 movaps 0xc0(%rbp), %xmm14
1832 movaps 0xd0(%rbp), %xmm15
1833 lea 0xa0(%rbp), %rsp
# Reload the six registers pushed in the prologue (they sit above the 0x48
# byte scratch area) and release the frame.
1835 $code.=<<___;
1836 mov 0x48(%rsp), %r15
1837 mov 0x50(%rsp), %r14
1838 mov 0x58(%rsp), %r13
1839 mov 0x60(%rsp), %r12
1840 mov 0x68(%rsp), %rbx
1841 mov 0x70(%rsp), %rax
1842 lea 0x78(%rsp), %rsp
1843 mov %rax, %rbp
1844 .Lcbc_dec_epilogue:
# NOTE(review): no ret is visible between this label and .size in this view
# -- confirm against the full file.
1846 .size bsaes_cbc_encrypt,.-bsaes_cbc_encrypt
# bsaes_ctr32_encrypt_blocks -- CTR mode with a 32-bit counter kept in the
# last four bytes of the counter block.  Arguments as consumed below:
# arg1 = input, arg2 = output, arg3 = number of 16-byte blocks, arg4 = key
# (rounds at offset 240), arg5 = pointer to the 16-byte counter block.
# Calls with fewer than 8 blocks run asm_AES_encrypt once per block.
1848 .globl bsaes_ctr32_encrypt_blocks
1849 .type bsaes_ctr32_encrypt_blocks,\@abi-omnipotent
1850 .align 16
1851 bsaes_ctr32_encrypt_blocks:
1852 mov %rsp, %rax
1853 .Lctr_enc_prologue:
1854 push %rbp
1855 push %rbx
1856 push %r12
1857 push %r13
1858 push %r14
1859 push %r15
1860 lea -0x48(%rsp), %rsp
# Win64 only: fetch the counter pointer from the stack, then grow the frame
# by 0xa0 bytes and save %xmm6-%xmm15 in it.
1862 $code.=<<___ if ($win64);
1863 mov 0xa0(%rsp),$arg5 # pull ivp
1864 lea -0xa0(%rsp), %rsp
1865 movaps %xmm6, 0x40(%rsp)
1866 movaps %xmm7, 0x50(%rsp)
1867 movaps %xmm8, 0x60(%rsp)
1868 movaps %xmm9, 0x70(%rsp)
1869 movaps %xmm10, 0x80(%rsp)
1870 movaps %xmm11, 0x90(%rsp)
1871 movaps %xmm12, 0xa0(%rsp)
1872 movaps %xmm13, 0xb0(%rsp)
1873 movaps %xmm14, 0xc0(%rsp)
1874 movaps %xmm15, 0xd0(%rsp)
1875 .Lctr_enc_body:
# Common body.  A private copy of the counter block lives at 0x20(%rbp) for
# the whole call; the short path also uses 0x30(%rbp) for the keystream.
1877 $code.=<<___;
1878 mov %rsp, %rbp # backup %rsp
1879 movdqu ($arg5), %xmm0 # load counter
1880 mov 240($arg4), %eax # rounds
1881 mov $arg1, $inp # backup arguments
1882 mov $arg2, $out
1883 mov $arg3, $len
1884 mov $arg4, $key
1885 movdqa %xmm0, 0x20(%rbp) # copy counter
1886 cmp \$8, $arg3
1887 jb .Lctr_enc_short
# 8 or more blocks: reserve rounds*128-96 bytes of stack for the bit-sliced
# key schedule and have _bsaes_key_convert fill it in.
1889 mov %eax, %ebx # rounds
1890 shl \$7, %rax # 128 bytes per inner round key
1891 sub \$`128-32`, %rax # size of bit-sliced key schedule
1892 sub %rax, %rsp
1894 mov %rsp, %rax # pass key schedule
1895 mov $key, %rcx # pass key
1896 mov %ebx, %r10d # pass rounds
1897 call _bsaes_key_convert
1898 pxor %xmm6,%xmm7 # fix up last round key
1899 movdqa %xmm7,(%rax) # save last round key
# The counter and the round-0 key are both shuffled with the .LSWPUP mask
# once up front, so the paddd constants below can be applied directly.
1901 movdqa (%rsp), @XMM[9] # load round0 key
1902 lea .LADD1(%rip), %r11
1903 movdqa 0x20(%rbp), @XMM[0] # counter copy
1904 movdqa -0x20(%r11), @XMM[8] # .LSWPUP
1905 pshufb @XMM[8], @XMM[9] # byte swap upper part
1906 pshufb @XMM[8], @XMM[0]
1907 movdqa @XMM[9], (%rsp) # save adjusted round0 key
1908 jmp .Lctr_enc_loop
1909 .align 16
1910 .Lctr_enc_loop:
1911 movdqa @XMM[0], 0x20(%rbp) # save counter
1912 movdqa @XMM[0], @XMM[1] # prepare 8 counter values
1913 movdqa @XMM[0], @XMM[2]
1914 paddd 0x00(%r11), @XMM[1] # .LADD1
1915 movdqa @XMM[0], @XMM[3]
1916 paddd 0x10(%r11), @XMM[2] # .LADD2
1917 movdqa @XMM[0], @XMM[4]
1918 paddd 0x20(%r11), @XMM[3] # .LADD3
1919 movdqa @XMM[0], @XMM[5]
1920 paddd 0x30(%r11), @XMM[4] # .LADD4
1921 movdqa @XMM[0], @XMM[6]
1922 paddd 0x40(%r11), @XMM[5] # .LADD5
1923 movdqa @XMM[0], @XMM[7]
1924 paddd 0x50(%r11), @XMM[6] # .LADD6
1925 paddd 0x60(%r11), @XMM[7] # .LADD7
1927 # Borrow prologue from _bsaes_encrypt8 to use the opportunity
1928 # to flip byte order in 32-bit counter
1929 movdqa (%rsp), @XMM[9] # round 0 key
1930 lea 0x10(%rsp), %rax # pass key schedule
1931 movdqa -0x10(%r11), @XMM[8] # .LSWPUPM0SR
1932 pxor @XMM[9], @XMM[0] # xor with round0 key
1933 pxor @XMM[9], @XMM[1]
1934 pxor @XMM[9], @XMM[2]
1935 pxor @XMM[9], @XMM[3]
1936 pshufb @XMM[8], @XMM[0]
1937 pshufb @XMM[8], @XMM[1]
1938 pxor @XMM[9], @XMM[4]
1939 pxor @XMM[9], @XMM[5]
1940 pshufb @XMM[8], @XMM[2]
1941 pshufb @XMM[8], @XMM[3]
1942 pxor @XMM[9], @XMM[6]
1943 pxor @XMM[9], @XMM[7]
1944 pshufb @XMM[8], @XMM[4]
1945 pshufb @XMM[8], @XMM[5]
1946 pshufb @XMM[8], @XMM[6]
1947 pshufb @XMM[8], @XMM[7]
1948 lea .LBS0(%rip), %r11 # constants table
1949 mov %ebx,%r10d # pass rounds
1951 call _bsaes_encrypt8_bitslice
# Eight keystream blocks are now available (in registers 0,1,4,6,3,7,2,5).
# A borrow here means fewer than 8 input blocks are left to consume them.
1953 sub \$8,$len
1954 jc .Lctr_enc_loop_done
1956 movdqu 0x00($inp), @XMM[8] # load input
1957 movdqu 0x10($inp), @XMM[9]
1958 movdqu 0x20($inp), @XMM[10]
1959 movdqu 0x30($inp), @XMM[11]
1960 movdqu 0x40($inp), @XMM[12]
1961 movdqu 0x50($inp), @XMM[13]
1962 movdqu 0x60($inp), @XMM[14]
1963 movdqu 0x70($inp), @XMM[15]
1964 lea 0x80($inp),$inp
1965 pxor @XMM[0], @XMM[8]
1966 movdqa 0x20(%rbp), @XMM[0] # load counter
1967 pxor @XMM[9], @XMM[1]
1968 movdqu @XMM[8], 0x00($out) # write output
1969 pxor @XMM[10], @XMM[4]
1970 movdqu @XMM[1], 0x10($out)
1971 pxor @XMM[11], @XMM[6]
1972 movdqu @XMM[4], 0x20($out)
1973 pxor @XMM[12], @XMM[3]
1974 movdqu @XMM[6], 0x30($out)
1975 pxor @XMM[13], @XMM[7]
1976 movdqu @XMM[3], 0x40($out)
1977 pxor @XMM[14], @XMM[2]
1978 movdqu @XMM[7], 0x50($out)
1979 pxor @XMM[15], @XMM[5]
1980 movdqu @XMM[2], 0x60($out)
# %r11 was repointed at .LBS0 for the call; aim it back at the .LADD table.
1981 lea .LADD1(%rip), %r11
1982 movdqu @XMM[5], 0x70($out)
1983 lea 0x80($out), $out
1984 paddd 0x70(%r11), @XMM[0] # .LADD8
# The flags still come from the sub above: none of the SSE instructions or
# lea in between modify them.
1985 jnz .Lctr_enc_loop
1987 jmp .Lctr_enc_done
1988 .align 16
# 1..7 blocks remain: XOR and store one block at a time, leaving as soon as
# the remaining count is covered.
1989 .Lctr_enc_loop_done:
1990 add \$8, $len
1991 movdqu 0x00($inp), @XMM[8] # load input
1992 pxor @XMM[8], @XMM[0]
1993 movdqu @XMM[0], 0x00($out) # write output
1994 cmp \$2,$len
1995 jb .Lctr_enc_done
1996 movdqu 0x10($inp), @XMM[9]
1997 pxor @XMM[9], @XMM[1]
1998 movdqu @XMM[1], 0x10($out)
1999 je .Lctr_enc_done
2000 movdqu 0x20($inp), @XMM[10]
2001 pxor @XMM[10], @XMM[4]
2002 movdqu @XMM[4], 0x20($out)
2003 cmp \$4,$len
2004 jb .Lctr_enc_done
2005 movdqu 0x30($inp), @XMM[11]
2006 pxor @XMM[11], @XMM[6]
2007 movdqu @XMM[6], 0x30($out)
2008 je .Lctr_enc_done
2009 movdqu 0x40($inp), @XMM[12]
2010 pxor @XMM[12], @XMM[3]
2011 movdqu @XMM[3], 0x40($out)
2012 cmp \$6,$len
2013 jb .Lctr_enc_done
2014 movdqu 0x50($inp), @XMM[13]
2015 pxor @XMM[13], @XMM[7]
2016 movdqu @XMM[7], 0x50($out)
2017 je .Lctr_enc_done
2018 movdqu 0x60($inp), @XMM[14]
2019 pxor @XMM[14], @XMM[2]
2020 movdqu @XMM[2], 0x60($out)
2021 jmp .Lctr_enc_done
2023 .align 16
# Fewer than 8 blocks in total: no key schedule was allocated.  Each pass
# encrypts the counter block at 0x20(%rbp) into 0x30(%rbp), XORs that with
# one input block, and bumps the 32-bit word at 0x2c(%rbp) (the last four
# bytes of the counter block) by one, byte-swapping around the increment.
2024 .Lctr_enc_short:
2025 lea 0x20(%rbp), $arg1
2026 lea 0x30(%rbp), $arg2
2027 lea ($key), $arg3
2028 call asm_AES_encrypt
2029 movdqu ($inp), @XMM[1]
2030 lea 16($inp), $inp
2031 mov 0x2c(%rbp), %eax # load 32-bit counter
2032 bswap %eax
2033 pxor 0x30(%rbp), @XMM[1]
2034 inc %eax # increment
2035 movdqu @XMM[1], ($out)
2036 bswap %eax
2037 lea 16($out), $out
# Store through %rbp, matching the load above and the buffer handed to
# asm_AES_encrypt; the previous %rsp-relative store hit the same address
# only because %rsp equals %rbp on this path.
2038 mov %eax, 0x2c(%rbp) # save 32-bit counter
2039 dec $len
2040 jnz .Lctr_enc_short
2042 .Lctr_enc_done:
2043 lea (%rsp), %rax
2044 pxor %xmm0, %xmm0
# Zero the stack from %rsp upwards, 32 bytes per pass, while %rax is still
# below %rbp (cmp sets the flags for %rbp-%rax).
2045 .Lctr_enc_bzero: # wipe key schedule [if any]
2046 movdqa %xmm0, 0x00(%rax)
2047 movdqa %xmm0, 0x10(%rax)
2048 lea 0x20(%rax), %rax
2049 cmp %rax, %rbp
2050 ja .Lctr_enc_bzero
2052 lea (%rbp),%rsp # restore %rsp
# Win64 only: reload %xmm6-%xmm15 and drop the extra 0xa0 bytes.
2054 $code.=<<___ if ($win64);
2055 movaps 0x40(%rbp), %xmm6
2056 movaps 0x50(%rbp), %xmm7
2057 movaps 0x60(%rbp), %xmm8
2058 movaps 0x70(%rbp), %xmm9
2059 movaps 0x80(%rbp), %xmm10
2060 movaps 0x90(%rbp), %xmm11
2061 movaps 0xa0(%rbp), %xmm12
2062 movaps 0xb0(%rbp), %xmm13
2063 movaps 0xc0(%rbp), %xmm14
2064 movaps 0xd0(%rbp), %xmm15
2065 lea 0xa0(%rbp), %rsp
# Reload the six registers pushed in the prologue (they sit above the 0x48
# byte scratch area) and release the frame.
2067 $code.=<<___;
2068 mov 0x48(%rsp), %r15
2069 mov 0x50(%rsp), %r14
2070 mov 0x58(%rsp), %r13
2071 mov 0x60(%rsp), %r12
2072 mov 0x68(%rsp), %rbx
2073 mov 0x70(%rsp), %rax
2074 lea 0x78(%rsp), %rsp
2075 mov %rax, %rbp
2076 .Lctr_enc_epilogue:
# NOTE(review): no ret is visible between this label and .size in this view
# -- confirm against the full file.
2078 .size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
2080 ######################################################################
2081 # void bsaes_xts_[en|de]crypt(const char *inp,char *out,size_t len,
2082 # const AES_KEY *key1, const AES_KEY *key2,
2083 # const unsigned char iv[16]);
# Scratch registers for the XTS tweak arithmetic: mask constant, carry/residue
# and sign-broadcast temporaries.  They alias XMM 13..15, which the XTS code
# also loads input blocks into, so the routines rebuild them when needed.
2085 my ($twmask,$twres,$twtmp)=@XMM[13..15];
# Strip a trailing "d" from the sixth-argument register name.
# NOTE(review): presumably this turns the 32-bit name used for the CBC
# direction flag into the 64-bit name, since XTS passes a pointer (ivp) in
# that slot -- confirm against where arg6 is defined.
2086 $arg6=~s/d$//;
# bsaes_xts_encrypt -- XTS encryption, eight blocks per _bsaes_encrypt8 call.
# Arguments as consumed below: arg1 = input, arg2 = output, arg3 = length in
# bytes, arg4 = data key (rounds at offset 240), arg5 = tweak key, arg6 = IV
# pointer.  The initial tweak is the IV encrypted under the tweak key.  A
# length that is not a multiple of 16 is finished by the byte-swapping
# "steal" loop near the end.
2088 $code.=<<___;
2089 .globl bsaes_xts_encrypt
2090 .type bsaes_xts_encrypt,\@abi-omnipotent
2091 .align 16
2092 bsaes_xts_encrypt:
2093 mov %rsp, %rax
2094 .Lxts_enc_prologue:
2095 push %rbp
2096 push %rbx
2097 push %r12
2098 push %r13
2099 push %r14
2100 push %r15
2101 lea -0x48(%rsp), %rsp
# Win64 only: fetch the fifth and sixth arguments from the stack, then grow
# the frame by 0xa0 bytes and save %xmm6-%xmm15 in it.
2103 $code.=<<___ if ($win64);
2104 mov 0xa0(%rsp),$arg5 # pull key2
2105 mov 0xa8(%rsp),$arg6 # pull ivp
2106 lea -0xa0(%rsp), %rsp
2107 movaps %xmm6, 0x40(%rsp)
2108 movaps %xmm7, 0x50(%rsp)
2109 movaps %xmm8, 0x60(%rsp)
2110 movaps %xmm9, 0x70(%rsp)
2111 movaps %xmm10, 0x80(%rsp)
2112 movaps %xmm11, 0x90(%rsp)
2113 movaps %xmm12, 0xa0(%rsp)
2114 movaps %xmm13, 0xb0(%rsp)
2115 movaps %xmm14, 0xc0(%rsp)
2116 movaps %xmm15, 0xd0(%rsp)
2117 .Lxts_enc_body:
# Common body.  The slot at 0x20(%rbp) first receives the initial tweak and
# is reused later as a one-block buffer for asm_AES_encrypt.  The stack then
# holds, from %rsp upwards: 0x80 bytes for eight saved tweaks, followed by
# the bit-sliced key schedule (passed as 0x80(%rsp)).  The full byte length
# is kept in %rbx for the stealing test; the working length is rounded down
# to whole blocks.
2119 $code.=<<___;
2120 mov %rsp, %rbp # backup %rsp
2121 mov $arg1, $inp # backup arguments
2122 mov $arg2, $out
2123 mov $arg3, $len
2124 mov $arg4, $key
2126 lea ($arg6), $arg1
2127 lea 0x20(%rbp), $arg2
2128 lea ($arg5), $arg3
2129 call asm_AES_encrypt # generate initial tweak
2131 mov 240($key), %eax # rounds
2132 mov $len, %rbx # backup $len
2134 mov %eax, %edx # rounds
2135 shl \$7, %rax # 128 bytes per inner round key
2136 sub \$`128-32`, %rax # size of bit-sliced key schedule
2137 sub %rax, %rsp
2139 mov %rsp, %rax # pass key schedule
2140 mov $key, %rcx # pass key
2141 mov %edx, %r10d # pass rounds
2142 call _bsaes_key_convert
2143 pxor %xmm6, %xmm7 # fix up last round key
2144 movdqa %xmm7, (%rax) # save last round key
2146 and \$-16, $len
2147 sub \$0x80, %rsp # place for tweak[8]
2148 movdqa 0x20(%rbp), @XMM[7] # initial tweak
2150 pxor $twtmp, $twtmp
2151 movdqa .Lxts_magic(%rip), $twmask
2152 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2154 sub \$0x80, $len
2155 jc .Lxts_enc_short
2156 jmp .Lxts_enc_loop
2158 .align 16
2159 .Lxts_enc_loop:
# Unrolled seven times at generation time.  Pass i copies the running tweak
# (register 7) into register i and into the tweak slot i on the stack, then
# advances the running tweak: paddq doubles each 64-bit half and the
# carry/residue picked out by the mask is XORed back in.  Loading input
# block i-1 and XORing block i-2 with its tweak are interleaved with that.
2161 for ($i=0;$i<7;$i++) {
2162 $code.=<<___;
2163 pshufd \$0x13, $twtmp, $twres
2164 pxor $twtmp, $twtmp
2165 movdqa @XMM[7], @XMM[$i]
2166 movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2167 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2168 pand $twmask, $twres # isolate carry and residue
2169 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2170 pxor $twres, @XMM[7]
2172 $code.=<<___ if ($i>=1);
2173 movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
2175 $code.=<<___ if ($i>=2);
2176 pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
# Finish the group of eight: the eighth tweak is the running tweak itself
# (saved at 0x70(%rsp)).  Input blocks 5..7 are loaded into XMM 13..15, i.e.
# over the tweak scratch registers, which is why the mask and the sign
# broadcast are rebuilt after the call.
2179 $code.=<<___;
2180 movdqu 0x60($inp), @XMM[8+6]
2181 pxor @XMM[8+5], @XMM[5]
2182 movdqu 0x70($inp), @XMM[8+7]
2183 lea 0x80($inp), $inp
2184 movdqa @XMM[7], 0x70(%rsp)
2185 pxor @XMM[8+6], @XMM[6]
2186 lea 0x80(%rsp), %rax # pass key schedule
2187 pxor @XMM[8+7], @XMM[7]
2188 mov %edx, %r10d # pass rounds
2190 call _bsaes_encrypt8
2192 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2193 pxor 0x10(%rsp), @XMM[1]
2194 movdqu @XMM[0], 0x00($out) # write output
2195 pxor 0x20(%rsp), @XMM[4]
2196 movdqu @XMM[1], 0x10($out)
2197 pxor 0x30(%rsp), @XMM[6]
2198 movdqu @XMM[4], 0x20($out)
2199 pxor 0x40(%rsp), @XMM[3]
2200 movdqu @XMM[6], 0x30($out)
2201 pxor 0x50(%rsp), @XMM[7]
2202 movdqu @XMM[3], 0x40($out)
2203 pxor 0x60(%rsp), @XMM[2]
2204 movdqu @XMM[7], 0x50($out)
2205 pxor 0x70(%rsp), @XMM[5]
2206 movdqu @XMM[2], 0x60($out)
2207 movdqu @XMM[5], 0x70($out)
2208 lea 0x80($out), $out
2210 movdqa 0x70(%rsp), @XMM[7] # prepare next iteration tweak
2211 pxor $twtmp, $twtmp
2212 movdqa .Lxts_magic(%rip), $twmask
2213 pcmpgtd @XMM[7], $twtmp
2214 pshufd \$0x13, $twtmp, $twres
2215 pxor $twtmp, $twtmp
2216 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2217 pand $twmask, $twres # isolate carry and residue
2218 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2219 pxor $twres, @XMM[7]
2221 sub \$0x80,$len
2222 jnc .Lxts_enc_loop
# Fewer than 0x80 bytes of whole blocks remain; undo the bias and leave if
# nothing is left.
2224 .Lxts_enc_short:
2225 add \$0x80, $len
2226 jz .Lxts_enc_done
# Same unrolled tweak generation as the main loop, but after each input load
# the remaining byte count is compared with 0x10*i and control leaves for
# the matching .Lxts_enc_<i> tail.  At that point register i-1 holds an
# untouched tweak (and so does i-2 for i >= 2), which each tail XORs with
# the already loaded input before encrypting.
2228 for ($i=0;$i<7;$i++) {
2229 $code.=<<___;
2230 pshufd \$0x13, $twtmp, $twres
2231 pxor $twtmp, $twtmp
2232 movdqa @XMM[7], @XMM[$i]
2233 movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2234 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2235 pand $twmask, $twres # isolate carry and residue
2236 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2237 pxor $twres, @XMM[7]
2239 $code.=<<___ if ($i>=1);
2240 movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
2241 cmp \$`0x10*$i`,$len
2242 je .Lxts_enc_$i
2244 $code.=<<___ if ($i>=2);
2245 pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
# Seven blocks remain.  Every tail ends by loading the first unused saved
# tweak into register 7 for the stealing code.
2248 $code.=<<___;
2249 movdqu 0x60($inp), @XMM[8+6]
2250 pxor @XMM[8+5], @XMM[5]
2251 movdqa @XMM[7], 0x70(%rsp)
2252 lea 0x70($inp), $inp
2253 pxor @XMM[8+6], @XMM[6]
2254 lea 0x80(%rsp), %rax # pass key schedule
2255 mov %edx, %r10d # pass rounds
2257 call _bsaes_encrypt8
2259 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2260 pxor 0x10(%rsp), @XMM[1]
2261 movdqu @XMM[0], 0x00($out) # write output
2262 pxor 0x20(%rsp), @XMM[4]
2263 movdqu @XMM[1], 0x10($out)
2264 pxor 0x30(%rsp), @XMM[6]
2265 movdqu @XMM[4], 0x20($out)
2266 pxor 0x40(%rsp), @XMM[3]
2267 movdqu @XMM[6], 0x30($out)
2268 pxor 0x50(%rsp), @XMM[7]
2269 movdqu @XMM[3], 0x40($out)
2270 pxor 0x60(%rsp), @XMM[2]
2271 movdqu @XMM[7], 0x50($out)
2272 movdqu @XMM[2], 0x60($out)
2273 lea 0x70($out), $out
2275 movdqa 0x70(%rsp), @XMM[7] # next iteration tweak
2276 jmp .Lxts_enc_done
2277 .align 16
2278 .Lxts_enc_6:
2279 pxor @XMM[8+4], @XMM[4]
2280 lea 0x60($inp), $inp
2281 pxor @XMM[8+5], @XMM[5]
2282 lea 0x80(%rsp), %rax # pass key schedule
2283 mov %edx, %r10d # pass rounds
2285 call _bsaes_encrypt8
2287 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2288 pxor 0x10(%rsp), @XMM[1]
2289 movdqu @XMM[0], 0x00($out) # write output
2290 pxor 0x20(%rsp), @XMM[4]
2291 movdqu @XMM[1], 0x10($out)
2292 pxor 0x30(%rsp), @XMM[6]
2293 movdqu @XMM[4], 0x20($out)
2294 pxor 0x40(%rsp), @XMM[3]
2295 movdqu @XMM[6], 0x30($out)
2296 pxor 0x50(%rsp), @XMM[7]
2297 movdqu @XMM[3], 0x40($out)
2298 movdqu @XMM[7], 0x50($out)
2299 lea 0x60($out), $out
2301 movdqa 0x60(%rsp), @XMM[7] # next iteration tweak
2302 jmp .Lxts_enc_done
2303 .align 16
2304 .Lxts_enc_5:
2305 pxor @XMM[8+3], @XMM[3]
2306 lea 0x50($inp), $inp
2307 pxor @XMM[8+4], @XMM[4]
2308 lea 0x80(%rsp), %rax # pass key schedule
2309 mov %edx, %r10d # pass rounds
2311 call _bsaes_encrypt8
2313 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2314 pxor 0x10(%rsp), @XMM[1]
2315 movdqu @XMM[0], 0x00($out) # write output
2316 pxor 0x20(%rsp), @XMM[4]
2317 movdqu @XMM[1], 0x10($out)
2318 pxor 0x30(%rsp), @XMM[6]
2319 movdqu @XMM[4], 0x20($out)
2320 pxor 0x40(%rsp), @XMM[3]
2321 movdqu @XMM[6], 0x30($out)
2322 movdqu @XMM[3], 0x40($out)
2323 lea 0x50($out), $out
2325 movdqa 0x50(%rsp), @XMM[7] # next iteration tweak
2326 jmp .Lxts_enc_done
2327 .align 16
2328 .Lxts_enc_4:
2329 pxor @XMM[8+2], @XMM[2]
2330 lea 0x40($inp), $inp
2331 pxor @XMM[8+3], @XMM[3]
2332 lea 0x80(%rsp), %rax # pass key schedule
2333 mov %edx, %r10d # pass rounds
2335 call _bsaes_encrypt8
2337 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2338 pxor 0x10(%rsp), @XMM[1]
2339 movdqu @XMM[0], 0x00($out) # write output
2340 pxor 0x20(%rsp), @XMM[4]
2341 movdqu @XMM[1], 0x10($out)
2342 pxor 0x30(%rsp), @XMM[6]
2343 movdqu @XMM[4], 0x20($out)
2344 movdqu @XMM[6], 0x30($out)
2345 lea 0x40($out), $out
2347 movdqa 0x40(%rsp), @XMM[7] # next iteration tweak
2348 jmp .Lxts_enc_done
2349 .align 16
2350 .Lxts_enc_3:
2351 pxor @XMM[8+1], @XMM[1]
2352 lea 0x30($inp), $inp
2353 pxor @XMM[8+2], @XMM[2]
2354 lea 0x80(%rsp), %rax # pass key schedule
2355 mov %edx, %r10d # pass rounds
2357 call _bsaes_encrypt8
2359 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2360 pxor 0x10(%rsp), @XMM[1]
2361 movdqu @XMM[0], 0x00($out) # write output
2362 pxor 0x20(%rsp), @XMM[4]
2363 movdqu @XMM[1], 0x10($out)
2364 movdqu @XMM[4], 0x20($out)
2365 lea 0x30($out), $out
2367 movdqa 0x30(%rsp), @XMM[7] # next iteration tweak
2368 jmp .Lxts_enc_done
2369 .align 16
2370 .Lxts_enc_2:
2371 pxor @XMM[8+0], @XMM[0]
2372 lea 0x20($inp), $inp
2373 pxor @XMM[8+1], @XMM[1]
2374 lea 0x80(%rsp), %rax # pass key schedule
2375 mov %edx, %r10d # pass rounds
2377 call _bsaes_encrypt8
2379 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2380 pxor 0x10(%rsp), @XMM[1]
2381 movdqu @XMM[0], 0x00($out) # write output
2382 movdqu @XMM[1], 0x10($out)
2383 lea 0x20($out), $out
2385 movdqa 0x20(%rsp), @XMM[7] # next iteration tweak
2386 jmp .Lxts_enc_done
2387 .align 16
# Single block: XOR the input with the tweak, encrypt it in place in the
# 0x20(%rbp) buffer with asm_AES_encrypt, and XOR the tweak (still in
# register 0) onto the result.  The commented-out lines are the bit-sliced
# equivalent.  This tail falls through into .Lxts_enc_done.
2388 .Lxts_enc_1:
2389 pxor @XMM[0], @XMM[8]
2390 lea 0x10($inp), $inp
2391 movdqa @XMM[8], 0x20(%rbp)
2392 lea 0x20(%rbp), $arg1
2393 lea 0x20(%rbp), $arg2
2394 lea ($key), $arg3
2395 call asm_AES_encrypt # doesn't touch %xmm
2396 pxor 0x20(%rbp), @XMM[0] # ^= tweak[]
2397 #pxor @XMM[8], @XMM[0]
2398 #lea 0x80(%rsp), %rax # pass key schedule
2399 #mov %edx, %r10d # pass rounds
2400 #call _bsaes_encrypt8
2401 #pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2402 movdqu @XMM[0], 0x00($out) # write output
2403 lea 0x10($out), $out
2405 movdqa 0x10(%rsp), @XMM[7] # next iteration tweak
# %ebx keeps only the low four bits of the original byte length: non-zero
# means a partial final block is still pending.
2407 .Lxts_enc_done:
2408 and \$15, %ebx
2409 jz .Lxts_enc_ret
2410 mov $out, %rdx
# Byte by byte: the leftover input byte replaces a byte of the last full
# output block, and the byte it displaces is written as the partial output.
2412 .Lxts_enc_steal:
2413 movzb ($inp), %eax
2414 movzb -16(%rdx), %ecx
2415 lea 1($inp), $inp
2416 mov %al, -16(%rdx)
2417 mov %cl, 0(%rdx)
2418 lea 1(%rdx), %rdx
2419 sub \$1,%ebx
2420 jnz .Lxts_enc_steal
# Re-encrypt the patched block under the tweak in register 7, using the
# 0x20(%rbp) buffer for the asm_AES_encrypt call, and store it back in place.
2422 movdqu -16($out), @XMM[0]
2423 lea 0x20(%rbp), $arg1
2424 pxor @XMM[7], @XMM[0]
2425 lea 0x20(%rbp), $arg2
2426 movdqa @XMM[0], 0x20(%rbp)
2427 lea ($key), $arg3
2428 call asm_AES_encrypt # doesn't touch %xmm
2429 pxor 0x20(%rbp), @XMM[7]
2430 movdqu @XMM[7], -16($out)
2432 .Lxts_enc_ret:
2433 lea (%rsp), %rax
2434 pxor %xmm0, %xmm0
# Zero the stack from %rsp upwards (saved tweaks and key schedule), 32 bytes
# per pass, while %rax is still below %rbp.
2435 .Lxts_enc_bzero: # wipe key schedule [if any]
2436 movdqa %xmm0, 0x00(%rax)
2437 movdqa %xmm0, 0x10(%rax)
2438 lea 0x20(%rax), %rax
2439 cmp %rax, %rbp
2440 ja .Lxts_enc_bzero
2442 lea (%rbp),%rsp # restore %rsp
# Win64 only: reload %xmm6-%xmm15 and drop the extra 0xa0 bytes.
2444 $code.=<<___ if ($win64);
2445 movaps 0x40(%rbp), %xmm6
2446 movaps 0x50(%rbp), %xmm7
2447 movaps 0x60(%rbp), %xmm8
2448 movaps 0x70(%rbp), %xmm9
2449 movaps 0x80(%rbp), %xmm10
2450 movaps 0x90(%rbp), %xmm11
2451 movaps 0xa0(%rbp), %xmm12
2452 movaps 0xb0(%rbp), %xmm13
2453 movaps 0xc0(%rbp), %xmm14
2454 movaps 0xd0(%rbp), %xmm15
2455 lea 0xa0(%rbp), %rsp
# Reload the six registers pushed in the prologue (they sit above the 0x48
# byte scratch area) and release the frame.
2457 $code.=<<___;
2458 mov 0x48(%rsp), %r15
2459 mov 0x50(%rsp), %r14
2460 mov 0x58(%rsp), %r13
2461 mov 0x60(%rsp), %r12
2462 mov 0x68(%rsp), %rbx
2463 mov 0x70(%rsp), %rax
2464 lea 0x78(%rsp), %rsp
2465 mov %rax, %rbp
2466 .Lxts_enc_epilogue:
# NOTE(review): no ret is visible between this label and .size in this view
# -- confirm against the full file.
2468 .size bsaes_xts_encrypt,.-bsaes_xts_encrypt
2470 .globl bsaes_xts_decrypt
2471 .type bsaes_xts_decrypt,\@abi-omnipotent
2472 .align 16
2473 bsaes_xts_decrypt:
# XTS decryption entry point.  Six arguments are read below: arg1..arg4
# are copied into inp, out, len and key, while arg5 and arg6 (labelled
# key2 and ivp by the Win64 prologue) are used once, to produce the
# initial tweak.
# rax keeps the entry stack pointer; se_handler further down this file
# uses context->Rax as the stack pointer when a fault is reported before
# the body label.
2474 mov %rsp, %rax
2475 .Lxts_dec_prologue:
# Six callee-saved registers are pushed, then 0x48 more bytes of frame
# are reserved.  The epilogue reloads them from 0x48..0x70 above the
# restored stack pointer.
2476 push %rbp
2477 push %rbx
2478 push %r12
2479 push %r13
2480 push %r14
2481 push %r15
2482 lea -0x48(%rsp), %rsp
2484 $code.=<<___ if ($win64);
2485 mov 0xa0(%rsp),$arg5 # pull key2
2486 mov 0xa8(%rsp),$arg6 # pull ivp
2487 lea -0xa0(%rsp), %rsp
2488 movaps %xmm6, 0x40(%rsp)
2489 movaps %xmm7, 0x50(%rsp)
2490 movaps %xmm8, 0x60(%rsp)
2491 movaps %xmm9, 0x70(%rsp)
2492 movaps %xmm10, 0x80(%rsp)
2493 movaps %xmm11, 0x90(%rsp)
2494 movaps %xmm12, 0xa0(%rsp)
2495 movaps %xmm13, 0xb0(%rsp)
2496 movaps %xmm14, 0xc0(%rsp)
2497 movaps %xmm15, 0xd0(%rsp)
2498 .Lxts_dec_body:
2500 $code.=<<___;
# rbp is the frame base from here on; everything carved out below it
# (key schedule, tweak array) is zeroed and released at .Lxts_dec_ret.
2501 mov %rsp, %rbp # backup %rsp
2502 mov $arg1, $inp # backup arguments
2503 mov $arg2, $out
2504 mov $arg3, $len
2505 mov $arg4, $key
# asm_AES_encrypt(in = arg6, out = scratch slot 0x20(rbp), key = arg5).
# Note that this is an encrypt call even on the decrypt path.
2507 lea ($arg6), $arg1
2508 lea 0x20(%rbp), $arg2
2509 lea ($arg5), $arg3
2510 call asm_AES_encrypt # generate initial tweak
2512 mov 240($key), %eax # rounds
2513 mov $len, %rbx # backup $len
# Carve rounds*128 - 96 bytes off the stack for the converted schedule.
2515 mov %eax, %edx # rounds
2516 shl \$7, %rax # 128 bytes per inner round key
2517 sub \$`128-32`, %rax # size of bit-sliced key schedule
2518 sub %rax, %rsp
2520 mov %rsp, %rax # pass key schedule
2521 mov $key, %rcx # pass key
2522 mov %edx, %r10d # pass rounds
2523 call _bsaes_key_convert
# NOTE(review): the three instructions below rely on what
# _bsaes_key_convert leaves in xmm6, xmm7 and rax; that routine is
# outside this part of the file - confirm against its definition.
2524 pxor (%rsp), %xmm7 # fix up round 0 key
2525 movdqa %xmm6, (%rax) # save last round key
2526 movdqa %xmm7, (%rsp)
# len is rounded down to a multiple of 16 and, when the original length
# (kept in ebx) had a remainder, reduced by one more block; that block
# is handled by the ciphertext-stealing code at .Lxts_dec_done.
2528 xor %eax, %eax # if ($len%16) len-=16;
2529 and \$-16, $len
2530 test \$15, %ebx
2531 setnz %al
2532 shl \$4, %rax
2533 sub %rax, $len
# The tweak array sits directly below the key schedule, which is why
# the schedule is later passed as 0x80(rsp).
2535 sub \$0x80, %rsp # place for tweak[8]
2536 movdqa 0x20(%rbp), @XMM[7] # initial tweak
# Prime twtmp with the per-dword sign masks of the tweak and load the
# .Lxts_magic mask; both feed the first tweak-doubling step.
2538 pxor $twtmp, $twtmp
2539 movdqa .Lxts_magic(%rip), $twmask
2540 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
# Fewer than 0x80 bytes (eight blocks) left: take the short path.
2542 sub \$0x80, $len
2543 jc .Lxts_dec_short
2544 jmp .Lxts_dec_loop
2546 .align 16
2547 .Lxts_dec_loop:
# Bulk path: every pass decrypts eight 16-byte blocks (0x80 bytes).
# The Perl loop unrolls seven tweak steps; the eighth tweak is the value
# left in XMM[7] after the last step.
2549 for ($i=0;$i<7;$i++) {
2550 $code.=<<___;
# Tweak step: copy the current tweak into a data register and its stack
# slot, then advance XMM[7].  paddq doubles both 64-bit halves; twres
# (shuffled sign masks ANDed with .Lxts_magic = 0x87,0,1,0) carries the
# bit from the low half into the high half and folds the bit shifted out
# of the top back in as 0x87.
2551 pshufd \$0x13, $twtmp, $twres
2552 pxor $twtmp, $twtmp
2553 movdqa @XMM[7], @XMM[$i]
2554 movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2555 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2556 pand $twmask, $twres # isolate carry and residue
2557 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2558 pxor $twres, @XMM[7]
2560 $code.=<<___ if ($i>=1);
2561 movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
2563 $code.=<<___ if ($i>=2);
2564 pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
2567 $code.=<<___;
# Finish the batch: load input blocks 6 and 7, park the eighth tweak at
# 0x70(rsp) before XMM[7] is overwritten by the XOR with block 7, and
# apply the three outstanding input-xor-tweak operations.
2568 movdqu 0x60($inp), @XMM[8+6]
2569 pxor @XMM[8+5], @XMM[5]
2570 movdqu 0x70($inp), @XMM[8+7]
2571 lea 0x80($inp), $inp
2572 movdqa @XMM[7], 0x70(%rsp)
2573 pxor @XMM[8+6], @XMM[6]
2574 lea 0x80(%rsp), %rax # pass key schedule
2575 pxor @XMM[8+7], @XMM[7]
2576 mov %edx, %r10d # pass rounds
2578 call _bsaes_decrypt8
# Remove the tweak from each result and store it.  Tweak slots 0..7 are
# paired with XMM[0],[1],[6],[4],[2],[7],[3],[5] in that order
# (presumably the output register order of _bsaes_decrypt8 - confirm
# against that routine).
2580 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2581 pxor 0x10(%rsp), @XMM[1]
2582 movdqu @XMM[0], 0x00($out) # write output
2583 pxor 0x20(%rsp), @XMM[6]
2584 movdqu @XMM[1], 0x10($out)
2585 pxor 0x30(%rsp), @XMM[4]
2586 movdqu @XMM[6], 0x20($out)
2587 pxor 0x40(%rsp), @XMM[2]
2588 movdqu @XMM[4], 0x30($out)
2589 pxor 0x50(%rsp), @XMM[7]
2590 movdqu @XMM[2], 0x40($out)
2591 pxor 0x60(%rsp), @XMM[3]
2592 movdqu @XMM[7], 0x50($out)
2593 pxor 0x70(%rsp), @XMM[5]
2594 movdqu @XMM[3], 0x60($out)
2595 movdqu @XMM[5], 0x70($out)
2596 lea 0x80($out), $out
# Rebuild the running tweak from the saved eighth tweak with one more
# doubling step, leaving twmask and twtmp set up the way the next pass
# (or the short path) expects them.
2598 movdqa 0x70(%rsp), @XMM[7] # prepare next iteration tweak
2599 pxor $twtmp, $twtmp
2600 movdqa .Lxts_magic(%rip), $twmask
2601 pcmpgtd @XMM[7], $twtmp
2602 pshufd \$0x13, $twtmp, $twres
2603 pxor $twtmp, $twtmp
2604 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2605 pand $twmask, $twres # isolate carry and residue
2606 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2607 pxor $twres, @XMM[7]
# Loop again while at least another 0x80 bytes remain (no borrow).
2609 sub \$0x80,$len
2610 jnc .Lxts_dec_loop
2612 .Lxts_dec_short:
# Tail path for fewer than eight whole blocks.  The add undoes the
# earlier subtraction of 0x80; zero means no whole block is left.
2613 add \$0x80, $len
2614 jz .Lxts_dec_done
# Same tweak steps as the bulk loop, but after each newly loaded input
# block the remaining length is compared with 0x10*i and control leaves
# for the matching .Lxts_dec_i label.  At that point tweaks 0..i are
# saved, blocks 0..i-1 are loaded, and the last two input-xor-tweak
# operations are still outstanding (the label's code performs them).
2616 for ($i=0;$i<7;$i++) {
2617 $code.=<<___;
2618 pshufd \$0x13, $twtmp, $twres
2619 pxor $twtmp, $twtmp
2620 movdqa @XMM[7], @XMM[$i]
2621 movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2622 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2623 pand $twmask, $twres # isolate carry and residue
2624 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2625 pxor $twres, @XMM[7]
2627 $code.=<<___ if ($i>=1);
2628 movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
2629 cmp \$`0x10*$i`,$len
2630 je .Lxts_dec_$i
2632 $code.=<<___ if ($i>=2);
2633 pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
2636 $code.=<<___;
# Fall-through case: exactly seven whole blocks remain.  The eighth
# tweak is parked at 0x70(rsp) so it can serve as the tweak for the
# stealing step.
2637 movdqu 0x60($inp), @XMM[8+6]
2638 pxor @XMM[8+5], @XMM[5]
2639 movdqa @XMM[7], 0x70(%rsp)
2640 lea 0x70($inp), $inp
2641 pxor @XMM[8+6], @XMM[6]
2642 lea 0x80(%rsp), %rax # pass key schedule
2643 mov %edx, %r10d # pass rounds
2645 call _bsaes_decrypt8
# Only the first seven results are un-tweaked and stored.
2647 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2648 pxor 0x10(%rsp), @XMM[1]
2649 movdqu @XMM[0], 0x00($out) # write output
2650 pxor 0x20(%rsp), @XMM[6]
2651 movdqu @XMM[1], 0x10($out)
2652 pxor 0x30(%rsp), @XMM[4]
2653 movdqu @XMM[6], 0x20($out)
2654 pxor 0x40(%rsp), @XMM[2]
2655 movdqu @XMM[4], 0x30($out)
2656 pxor 0x50(%rsp), @XMM[7]
2657 movdqu @XMM[2], 0x40($out)
2658 pxor 0x60(%rsp), @XMM[3]
2659 movdqu @XMM[7], 0x50($out)
2660 movdqu @XMM[3], 0x60($out)
2661 lea 0x70($out), $out
2663 movdqa 0x70(%rsp), @XMM[7] # next iteration tweak
2664 jmp .Lxts_dec_done
2665 .align 16
2666 .Lxts_dec_6:
# Six whole blocks remain.  Apply the two outstanding input-xor-tweak
# operations (blocks 4 and 5), run the 8-wide decrypt, then un-tweak and
# store only the first six results.
2667 pxor @XMM[8+4], @XMM[4]
2668 lea 0x60($inp), $inp
2669 pxor @XMM[8+5], @XMM[5]
2670 lea 0x80(%rsp), %rax # pass key schedule
2671 mov %edx, %r10d # pass rounds
2673 call _bsaes_decrypt8
2675 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2676 pxor 0x10(%rsp), @XMM[1]
2677 movdqu @XMM[0], 0x00($out) # write output
2678 pxor 0x20(%rsp), @XMM[6]
2679 movdqu @XMM[1], 0x10($out)
2680 pxor 0x30(%rsp), @XMM[4]
2681 movdqu @XMM[6], 0x20($out)
2682 pxor 0x40(%rsp), @XMM[2]
2683 movdqu @XMM[4], 0x30($out)
2684 pxor 0x50(%rsp), @XMM[7]
2685 movdqu @XMM[2], 0x40($out)
2686 movdqu @XMM[7], 0x50($out)
2687 lea 0x60($out), $out
# Slot 6 holds the first tweak not consumed by this batch.
2689 movdqa 0x60(%rsp), @XMM[7] # next iteration tweak
2690 jmp .Lxts_dec_done
2691 .align 16
2692 .Lxts_dec_5:
# Five whole blocks remain.  Apply the two outstanding input-xor-tweak
# operations (blocks 3 and 4), run the 8-wide decrypt, then un-tweak and
# store only the first five results.
2693 pxor @XMM[8+3], @XMM[3]
2694 lea 0x50($inp), $inp
2695 pxor @XMM[8+4], @XMM[4]
2696 lea 0x80(%rsp), %rax # pass key schedule
2697 mov %edx, %r10d # pass rounds
2699 call _bsaes_decrypt8
2701 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2702 pxor 0x10(%rsp), @XMM[1]
2703 movdqu @XMM[0], 0x00($out) # write output
2704 pxor 0x20(%rsp), @XMM[6]
2705 movdqu @XMM[1], 0x10($out)
2706 pxor 0x30(%rsp), @XMM[4]
2707 movdqu @XMM[6], 0x20($out)
2708 pxor 0x40(%rsp), @XMM[2]
2709 movdqu @XMM[4], 0x30($out)
2710 movdqu @XMM[2], 0x40($out)
2711 lea 0x50($out), $out
# Slot 5 holds the first tweak not consumed by this batch.
2713 movdqa 0x50(%rsp), @XMM[7] # next iteration tweak
2714 jmp .Lxts_dec_done
2715 .align 16
2716 .Lxts_dec_4:
# Four whole blocks remain.  Apply the two outstanding input-xor-tweak
# operations (blocks 2 and 3), run the 8-wide decrypt, then un-tweak and
# store only the first four results.
2717 pxor @XMM[8+2], @XMM[2]
2718 lea 0x40($inp), $inp
2719 pxor @XMM[8+3], @XMM[3]
2720 lea 0x80(%rsp), %rax # pass key schedule
2721 mov %edx, %r10d # pass rounds
2723 call _bsaes_decrypt8
2725 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2726 pxor 0x10(%rsp), @XMM[1]
2727 movdqu @XMM[0], 0x00($out) # write output
2728 pxor 0x20(%rsp), @XMM[6]
2729 movdqu @XMM[1], 0x10($out)
2730 pxor 0x30(%rsp), @XMM[4]
2731 movdqu @XMM[6], 0x20($out)
2732 movdqu @XMM[4], 0x30($out)
2733 lea 0x40($out), $out
# Slot 4 holds the first tweak not consumed by this batch.
2735 movdqa 0x40(%rsp), @XMM[7] # next iteration tweak
2736 jmp .Lxts_dec_done
2737 .align 16
2738 .Lxts_dec_3:
# Three whole blocks remain.  Apply the two outstanding input-xor-tweak
# operations (blocks 1 and 2), run the 8-wide decrypt, then un-tweak and
# store only the first three results.
2739 pxor @XMM[8+1], @XMM[1]
2740 lea 0x30($inp), $inp
2741 pxor @XMM[8+2], @XMM[2]
2742 lea 0x80(%rsp), %rax # pass key schedule
2743 mov %edx, %r10d # pass rounds
2745 call _bsaes_decrypt8
2747 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2748 pxor 0x10(%rsp), @XMM[1]
2749 movdqu @XMM[0], 0x00($out) # write output
2750 pxor 0x20(%rsp), @XMM[6]
2751 movdqu @XMM[1], 0x10($out)
2752 movdqu @XMM[6], 0x20($out)
2753 lea 0x30($out), $out
# Slot 3 holds the first tweak not consumed by this batch.
2755 movdqa 0x30(%rsp), @XMM[7] # next iteration tweak
2756 jmp .Lxts_dec_done
2757 .align 16
2758 .Lxts_dec_2:
# Two whole blocks remain.  Neither input-xor-tweak has been applied
# yet, so both are done here before the 8-wide decrypt; only the first
# two results are un-tweaked and stored.
2759 pxor @XMM[8+0], @XMM[0]
2760 lea 0x20($inp), $inp
2761 pxor @XMM[8+1], @XMM[1]
2762 lea 0x80(%rsp), %rax # pass key schedule
2763 mov %edx, %r10d # pass rounds
2765 call _bsaes_decrypt8
2767 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2768 pxor 0x10(%rsp), @XMM[1]
2769 movdqu @XMM[0], 0x00($out) # write output
2770 movdqu @XMM[1], 0x10($out)
2771 lea 0x20($out), $out
# Slot 2 holds the first tweak not consumed by this batch.
2773 movdqa 0x20(%rsp), @XMM[7] # next iteration tweak
2774 jmp .Lxts_dec_done
2775 .align 16
2776 .Lxts_dec_1:
# One whole block remains.  Instead of the 8-wide bit-sliced routine the
# block is decrypted in place in the scratch slot at 0x20(rbp) by
# asm_AES_decrypt with the original (unconverted) key.  XMM[0] still
# holds tweak[0] after the call and is XORed with the result.
2777 pxor @XMM[0], @XMM[8]
2778 lea 0x10($inp), $inp
2779 movdqa @XMM[8], 0x20(%rbp)
2780 lea 0x20(%rbp), $arg1
2781 lea 0x20(%rbp), $arg2
2782 lea ($key), $arg3
2783 call asm_AES_decrypt # doesn't touch %xmm
2784 pxor 0x20(%rbp), @XMM[0] # ^= tweak[]
# The five commented-out instructions below are the bit-sliced
# alternative to the call above, kept for reference.
2785 #pxor @XMM[8], @XMM[0]
2786 #lea 0x80(%rsp), %rax # pass key schedule
2787 #mov %edx, %r10d # pass rounds
2788 #call _bsaes_decrypt8
2789 #pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2790 movdqu @XMM[0], 0x00($out) # write output
2791 lea 0x10($out), $out
# Slot 1 holds the first unused tweak; execution then falls through
# into .Lxts_dec_done.
2793 movdqa 0x10(%rsp), @XMM[7] # next iteration tweak
2795 .Lxts_dec_done:
# ebx still holds the caller's original length; a non-zero remainder
# modulo 16 means a partial final block, handled by ciphertext stealing.
2796 and \$15, %ebx
2797 jz .Lxts_dec_ret
# Keep the current tweak in XMM[6] and advance XMM[7] by one doubling
# step (same sequence as in the main loop).  The last whole ciphertext
# block is decrypted with the advanced tweak, the rebuilt block further
# down with the earlier one.
2799 pxor $twtmp, $twtmp
2800 movdqa .Lxts_magic(%rip), $twmask
2801 pcmpgtd @XMM[7], $twtmp
2802 pshufd \$0x13, $twtmp, $twres
2803 movdqa @XMM[7], @XMM[6]
2804 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2805 pand $twmask, $twres # isolate carry and residue
2806 movdqu ($inp), @XMM[0]
2807 pxor $twres, @XMM[7]
# Decrypt the last whole block in the scratch slot at 0x20(rbp) and
# write the un-tweaked result to the current output position.
2809 lea 0x20(%rbp), $arg1
2810 pxor @XMM[7], @XMM[0]
2811 lea 0x20(%rbp), $arg2
2812 movdqa @XMM[0], 0x20(%rbp)
2813 lea ($key), $arg3
2814 call asm_AES_decrypt # doesn't touch %xmm
2815 pxor 0x20(%rbp), @XMM[7]
2816 mov $out, %rdx
2817 movdqu @XMM[7], ($out)
2819 .Lxts_dec_steal:
# Byte-wise swap, repeated ebx times: byte j of the trailing partial
# input (at offset 16 from inp) replaces byte j of the block just
# written, and the byte it replaces becomes byte j of the final partial
# output (at offset 16 from the block).
2820 movzb 16($inp), %eax
2821 movzb (%rdx), %ecx
2822 lea 1($inp), $inp
2823 mov %al, (%rdx)
2824 mov %cl, 16(%rdx)
2825 lea 1(%rdx), %rdx
2826 sub \$1,%ebx
2827 jnz .Lxts_dec_steal
# Decrypt the rebuilt block with the earlier tweak kept in XMM[6] and
# store it over the same output position.
2829 movdqu ($out), @XMM[0]
2830 lea 0x20(%rbp), $arg1
2831 pxor @XMM[6], @XMM[0]
2832 lea 0x20(%rbp), $arg2
2833 movdqa @XMM[0], 0x20(%rbp)
2834 lea ($key), $arg3
2835 call asm_AES_decrypt # doesn't touch %xmm
2836 pxor 0x20(%rbp), @XMM[6]
2837 movdqu @XMM[6], ($out)
2839 .Lxts_dec_ret:
# Common exit: zero everything between the current stack pointer and
# the frame base (tweak array and converted key schedule), 32 bytes per
# pass, looping while rbp is still above the write pointer.
2840 lea (%rsp), %rax
2841 pxor %xmm0, %xmm0
2842 .Lxts_dec_bzero: # wipe key schedule [if any]
2843 movdqa %xmm0, 0x00(%rax)
2844 movdqa %xmm0, 0x10(%rax)
2845 lea 0x20(%rax), %rax
2846 cmp %rax, %rbp
2847 ja .Lxts_dec_bzero
2849 lea (%rbp),%rsp # restore %rsp
2851 $code.=<<___ if ($win64);
2852 movaps 0x40(%rbp), %xmm6
2853 movaps 0x50(%rbp), %xmm7
2854 movaps 0x60(%rbp), %xmm8
2855 movaps 0x70(%rbp), %xmm9
2856 movaps 0x80(%rbp), %xmm10
2857 movaps 0x90(%rbp), %xmm11
2858 movaps 0xa0(%rbp), %xmm12
2859 movaps 0xb0(%rbp), %xmm13
2860 movaps 0xc0(%rbp), %xmm14
2861 movaps 0xd0(%rbp), %xmm15
2862 lea 0xa0(%rbp), %rsp
2864 $code.=<<___;
# Reload the six registers pushed in the prologue (they sit above the
# 0x48-byte reserve) and drop the frame.  rbp goes through rax because
# rsp is adjusted before rbp is overwritten.
2865 mov 0x48(%rsp), %r15
2866 mov 0x50(%rsp), %r14
2867 mov 0x58(%rsp), %r13
2868 mov 0x60(%rsp), %r12
2869 mov 0x68(%rsp), %rbx
2870 mov 0x70(%rsp), %rax
2871 lea 0x78(%rsp), %rsp
2872 mov %rax, %rbp
# The label below is HandlerData[1] for this function in the .xdata
# table near the end of the file.
2873 .Lxts_dec_epilogue:
2875 .size bsaes_xts_decrypt,.-bsaes_xts_decrypt
2878 $code.=<<___;
# Read-only constant pool, 64-byte aligned at both ends.  Every entry
# before the identification string is a 16-byte vector, except .Lmasks
# which is four of them.
2879 .type _bsaes_const,\@object
2880 .align 64
2881 _bsaes_const:
2882 .LM0ISR: # InvShiftRows constants
2883 .quad 0x0a0e0206070b0f03, 0x0004080c0d010509
2884 .LISRM0:
2885 .quad 0x01040b0e0205080f, 0x0306090c00070a0d
2886 .LISR:
2887 .quad 0x0504070602010003, 0x0f0e0d0c080b0a09
2888 .LBS0: # bit-slice constants
2889 .quad 0x5555555555555555, 0x5555555555555555
2890 .LBS1:
2891 .quad 0x3333333333333333, 0x3333333333333333
2892 .LBS2:
2893 .quad 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
2894 .LSR: # shiftrows constants
2895 .quad 0x0504070600030201, 0x0f0e0d0c0a09080b
2896 .LSRM0:
2897 .quad 0x0304090e00050a0f, 0x01060b0c0207080d
2898 .LM0SR:
2899 .quad 0x0a0e02060f03070b, 0x0004080c05090d01
2900 .LSWPUP: # byte-swap upper dword
2901 .quad 0x0706050403020100, 0x0c0d0e0f0b0a0908
2902 .LSWPUPM0SR:
2903 .quad 0x0a0d02060c03070b, 0x0004080f05090e01
# .LADD1 .. .LADD8 hold the values 1..8 in the most significant 32-bit
# word of the vector; all other words are zero.
2904 .LADD1: # counter increment constants
2905 .quad 0x0000000000000000, 0x0000000100000000
2906 .LADD2:
2907 .quad 0x0000000000000000, 0x0000000200000000
2908 .LADD3:
2909 .quad 0x0000000000000000, 0x0000000300000000
2910 .LADD4:
2911 .quad 0x0000000000000000, 0x0000000400000000
2912 .LADD5:
2913 .quad 0x0000000000000000, 0x0000000500000000
2914 .LADD6:
2915 .quad 0x0000000000000000, 0x0000000600000000
2916 .LADD7:
2917 .quad 0x0000000000000000, 0x0000000700000000
2918 .LADD8:
2919 .quad 0x0000000000000000, 0x0000000800000000
# Mask used by the XTS tweak-doubling code: after the sign masks are
# shuffled, it keeps 0x87 in the lowest dword (fold-back of the bit
# shifted out of the top) and 1 in the third dword (carry from the low
# 64-bit half into the high one).
2920 .Lxts_magic:
2921 .long 0x87,0,1,0
# Four vectors with a single bit (1, 2, 4, 8) replicated in every byte.
2922 .Lmasks:
2923 .quad 0x0101010101010101, 0x0101010101010101
2924 .quad 0x0202020202020202, 0x0202020202020202
2925 .quad 0x0404040404040404, 0x0404040404040404
2926 .quad 0x0808080808080808, 0x0808080808080808
2927 .LM0:
2928 .quad 0x02060a0e03070b0f, 0x0004080c0105090d
# 0x63 replicated in every byte.
2929 .L63:
2930 .quad 0x6363636363636363, 0x6363636363636363
2931 .asciz "Bit-sliced AES for x86_64/SSSE3, Emilia Käsper, Peter Schwabe, Andy Polyakov"
2932 .align 64
2933 .size _bsaes_const,.-_bsaes_const
2936 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
2937 # CONTEXT *context,DISPATCHER_CONTEXT *disp)
# Win64-only structured-exception handler shared by every function
# listed in the .pdata/.xdata tables below.  It reconstructs the
# caller's stack pointer and callee-saved registers in the CONTEXT
# record, copies that record to disp->ContextRecord, calls
# RtlVirtualUnwind and returns ExceptionContinueSearch.
2938 if ($win64) {
2939 $rec="%rcx";
2940 $frame="%rdx";
2941 $context="%r8";
2942 $disp="%r9";
2944 $code.=<<___;
2945 .extern __imp_RtlVirtualUnwind
2946 .type se_handler,\@abi-omnipotent
2947 .align 16
2948 se_handler:
2949 push %rsi
2950 push %rdi
2951 push %rbx
2952 push %rbp
2953 push %r12
2954 push %r13
2955 push %r14
2956 push %r15
2957 pushfq
2958 sub \$64,%rsp
2960 mov 120($context),%rax # pull context->Rax
2961 mov 248($context),%rbx # pull context->Rip
2963 mov 8($disp),%rsi # disp->ImageBase
2964 mov 56($disp),%r11 # disp->HandlerData
# Case 1: fault before HandlerData[0] (the .xdata entries store each
# function's body label there).  Nothing usable has been set up yet and
# context->Rax, loaded above, is taken as the stack pointer.
2966 mov 0(%r11),%r10d # HandlerData[0]
2967 lea (%rsi,%r10),%r10 # prologue label
2968 cmp %r10,%rbx # context->Rip<prologue label
2969 jb .Lin_prologue
# Case 2: fault at or after HandlerData[1] (the epilogue label).  The
# frame has already been torn down, so context->Rsp is used unchanged.
2971 mov 152($context),%rax # pull context->Rsp
2973 mov 4(%r11),%r10d # HandlerData[1]
2974 lea (%rsi,%r10),%r10 # epilogue label
2975 cmp %r10,%rbx # context->Rip>=epilogue label
2976 jae .Lin_prologue
# Case 3: fault inside the body.  context->Rbp is the frame base; the
# steps below mirror the function epilogues: copy the ten saved xmm
# registers (20 quadwords) into the CONTEXT, skip the 0xa0-byte xmm
# area, reload the six saved registers and step over the 0x78-byte
# frame.
2978 mov 160($context),%rax # pull context->Rbp
2980 lea 0x40(%rax),%rsi # %xmm save area
2981 lea 512($context),%rdi # &context.Xmm6
2982 mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
2983 .long 0xa548f3fc # cld; rep movsq
2984 lea 0xa0(%rax),%rax # adjust stack pointer
2986 mov 0x70(%rax),%rbp
2987 mov 0x68(%rax),%rbx
2988 mov 0x60(%rax),%r12
2989 mov 0x58(%rax),%r13
2990 mov 0x50(%rax),%r14
2991 mov 0x48(%rax),%r15
2992 lea 0x78(%rax),%rax # adjust stack pointer
2993 mov %rbx,144($context) # restore context->Rbx
2994 mov %rbp,160($context) # restore context->Rbp
2995 mov %r12,216($context) # restore context->R12
2996 mov %r13,224($context) # restore context->R13
2997 mov %r14,232($context) # restore context->R14
2998 mov %r15,240($context) # restore context->R15
3000 .Lin_prologue:
3001 mov %rax,152($context) # restore context->Rsp
# Copy the adjusted CONTEXT (1232 bytes, moved as quadwords) over
# disp->ContextRecord.
3003 mov 40($disp),%rdi # disp->ContextRecord
3004 mov $context,%rsi # context
3005 mov \$`1232/8`,%ecx # sizeof(CONTEXT)
3006 .long 0xa548f3fc # cld; rep movsq
# Call RtlVirtualUnwind with eight arguments: four in registers and
# four in the stack slots at 32..56(rsp).
3008 mov $disp,%rsi
3009 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
3010 mov 8(%rsi),%rdx # arg2, disp->ImageBase
3011 mov 0(%rsi),%r8 # arg3, disp->ControlPc
3012 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
3013 mov 40(%rsi),%r10 # disp->ContextRecord
3014 lea 56(%rsi),%r11 # &disp->HandlerData
3015 lea 24(%rsi),%r12 # &disp->EstablisherFrame
3016 mov %r10,32(%rsp) # arg5
3017 mov %r11,40(%rsp) # arg6
3018 mov %r12,48(%rsp) # arg7
3019 mov %rcx,56(%rsp) # arg8, (NULL)
3020 call *__imp_RtlVirtualUnwind(%rip)
3022 mov \$1,%eax # ExceptionContinueSearch
3023 add \$64,%rsp
3024 popfq
3025 pop %r15
3026 pop %r14
3027 pop %r13
3028 pop %r12
3029 pop %rbp
3030 pop %rbx
3031 pop %rdi
3032 pop %rsi
3034 .size se_handler,.-se_handler
3036 .section .pdata
3037 .align 4
3039 $code.=<<___ if ($ecb);
# Function table: one entry of three image-relative addresses per
# function - start label, end label, and the matching record in the
# .xdata section.  The two ECB entries are emitted only when the ecb
# flag is set.
3040 .rva .Lecb_enc_prologue
3041 .rva .Lecb_enc_epilogue
3042 .rva .Lecb_enc_info
3044 .rva .Lecb_dec_prologue
3045 .rva .Lecb_dec_epilogue
3046 .rva .Lecb_dec_info
3048 $code.=<<___;
# Entries for the CBC-decrypt, CTR and XTS functions (always emitted
# inside the Win64 branch).
3049 .rva .Lcbc_dec_prologue
3050 .rva .Lcbc_dec_epilogue
3051 .rva .Lcbc_dec_info
3053 .rva .Lctr_enc_prologue
3054 .rva .Lctr_enc_epilogue
3055 .rva .Lctr_enc_info
3057 .rva .Lxts_enc_prologue
3058 .rva .Lxts_enc_epilogue
3059 .rva .Lxts_enc_info
3061 .rva .Lxts_dec_prologue
3062 .rva .Lxts_dec_epilogue
3063 .rva .Lxts_dec_info
3065 .section .xdata
3066 .align 8
3068 $code.=<<___ if ($ecb);
# Unwind records referenced from the function table.  Each one is a
# four-byte header, the address of se_handler, and two handler-data
# words that se_handler reads as HandlerData[0] (body label) and
# HandlerData[1] (epilogue label).
# NOTE(review): the header byte 9 presumably encodes unwind-info
# version 1 with the exception-handler flag set, and the zero bytes say
# no unwind codes are described - confirm against the Windows x64
# unwind-data documentation.
3069 .Lecb_enc_info:
3070 .byte 9,0,0,0
3071 .rva se_handler
3072 .rva .Lecb_enc_body,.Lecb_enc_epilogue # HandlerData[]
3073 .Lecb_dec_info:
3074 .byte 9,0,0,0
3075 .rva se_handler
3076 .rva .Lecb_dec_body,.Lecb_dec_epilogue # HandlerData[]
3078 $code.=<<___;
# Records for the CBC-decrypt, CTR and XTS functions, same layout as
# described for the ECB records.
3079 .Lcbc_dec_info:
3080 .byte 9,0,0,0
3081 .rva se_handler
3082 .rva .Lcbc_dec_body,.Lcbc_dec_epilogue # HandlerData[]
3083 .Lctr_enc_info:
3084 .byte 9,0,0,0
3085 .rva se_handler
3086 .rva .Lctr_enc_body,.Lctr_enc_epilogue # HandlerData[]
3087 .Lxts_enc_info:
3088 .byte 9,0,0,0
3089 .rva se_handler
3090 .rva .Lxts_enc_body,.Lxts_enc_epilogue # HandlerData[]
3091 .Lxts_dec_info:
3092 .byte 9,0,0,0
3093 .rva se_handler
3094 .rva .Lxts_dec_body,.Lxts_dec_epilogue # HandlerData[]
# Final stage of the generator.  Every backtick-quoted fragment in the
# accumulated assembly text is a Perl expression (offset and size
# arithmetic); evaluate each one and splice its value in place.
3098 $code =~ s/\`([^\`]*)\`/eval($1)/gem;
# Emit the finished assembly on STDOUT.
3100 print $code;
# Buffered write errors only surface when the handle is closed, and if
# STDOUT was redirected into a pipe earlier in the file (presumably to
# the perlasm translator - not visible from here) close also reports a
# failing child.  Abort instead of silently leaving a truncated or empty
# output file for the build to pick up.
3102 close STDOUT or die "error closing STDOUT: $!";