OpenSSL: update to 1.0.2a
[tomato.git] / release / src / router / openssl / crypto / aes / asm / aesni-x86.pl
blob3deb86aed636e11e8ad9136cd8db4c71c27b0ebd
1 #!/usr/bin/env perl
3 # ====================================================================
4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
10 # This module implements support for Intel AES-NI extension. In
11 # OpenSSL context it's used with Intel engine, but can also be used as
12 # drop-in replacement for crypto/aes/asm/aes-586.pl [see below for
13 # details].
15 # Performance.
17 # To start with see corresponding paragraph in aesni-x86_64.pl...
18 # Instead of filling table similar to one found there I've chosen to
19 # summarize *comparison* results for raw ECB, CTR and CBC benchmarks.
20 # The simplified table below represents 32-bit performance relative
21 # to 64-bit one in every given point. Ratios vary for different
22 # encryption modes, therefore interval values.
24 # 16-byte 64-byte 256-byte 1-KB 8-KB
25 # 53-67% 67-84% 91-94% 95-98% 97-99.5%
27 # Lower ratios for smaller block sizes are perfectly understandable,
28 # because function call overhead is higher in 32-bit mode. Largest
29 # 8-KB block performance is virtually same: 32-bit code is less than
30 # 1% slower for ECB, CBC and CCM, and ~3% slower otherwise.
32 # January 2011
34 # See aesni-x86_64.pl for details. Unlike x86_64 version this module
35 # interleaves at most 6 aes[enc|dec] instructions, because there are
36 # not enough registers for 8x interleave [which should be optimal for
37 # Sandy Bridge]. Actually, performance results for 6x interleave
38 # factor presented in aesni-x86_64.pl (except for CTR) are for this
39 # module.
41 # April 2011
43 # Add aesni_xts_[en|de]crypt. Westmere spends 1.50 cycles processing
44 # one byte out of 8KB with 128-bit key, Sandy Bridge - 1.09.
46 ######################################################################
47 # Current large-block performance in cycles per byte processed with
48 # 128-bit key (less is better).
50 # CBC en-/decrypt CTR XTS ECB
51 # Westmere 3.77/1.37 1.37 1.52 1.27
52 # * Bridge 5.07/0.98 0.99 1.09 0.91
53 # Haswell 4.44/0.80 0.97 1.03 0.72
54 # Atom 5.77/3.56 3.67 4.03 3.46
55 # Bulldozer 5.80/0.98 1.05 1.24 0.93
# Name prefix of the public entry points generated below
# ("${PREFIX}_encrypt", "${PREFIX}_decrypt"); several generators are
# additionally gated on $PREFIX being "aesni".
57 $PREFIX="aesni"; # if $PREFIX is set to "AES", the script
58 # generates drop-in replacement for
59 # crypto/aes/asm/aes-586.pl:-)
# When true, single-block code is expanded in place through
# aesni_inline_generate1 instead of calling _aesni_[en|de]crypt1.
60 $inline=1; # inline _aesni_[en|de]crypt
# Take the directory part of this script's path ($0) and add it, plus
# its ../../perlasm sibling, to @INC before requiring x86asm.pl.
62 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
63 push(@INC,"${dir}","${dir}../../perlasm");
64 require "x86asm.pl";
# The first command-line argument and the script name are handed to
# asm_init (defined outside this view, presumably by x86asm.pl).
66 &asm_init($ARGV[0],$0);
# $movekey is the instruction used to load round keys.
# NOTE(review): both branches select movups, so the test on $PREFIX
# currently has no effect on the generated code.
68 if ($PREFIX eq "aesni") { $movekey=\&movups; }
69 else { $movekey=\&movups; }
# General-purpose register assignments used by the code generators.
71 $len="eax";
72 $rounds="ecx";
73 $key="edx";
74 $inp="esi";
75 $out="edi";
76 $rounds_="ebx"; # backup copy for $rounds
77 $key_="ebp"; # backup copy for $key
# XMM register assignments: two round-key registers followed by up to
# six data-block registers.
79 $rndkey0="xmm0";
80 $rndkey1="xmm1";
81 $inout0="xmm2";
82 $inout1="xmm3";
83 $inout2="xmm4";
# xmm5-xmm7 each carry two names; both names of a pair refer to the
# same register, so they cannot hold different values at once.
84 $inout3="xmm5"; $in1="xmm5";
85 $inout4="xmm6"; $in0="xmm6";
86 $inout5="xmm7"; $ivec="xmm7";
88 # AESNI extension
# Emit AESKEYGENASSIST $dst,$src,$imm as raw bytes through data_byte
# (66 0F 3A DF, a register-to-register ModR/M byte, then $imm) rather
# than as a mnemonic.
#
#   $dst, $src - XMM register names, "xmm0" .. "xmm7"
#   $imm       - immediate byte appended after the ModR/M byte
#
# Dies when either operand is not exactly one of xmm0-xmm7.  Only the
# register-to-register form can be encoded here, and emitting nothing
# would silently drop the instruction from the generated code.
sub aeskeygenassist
{ my($dst,$src,$imm)=@_;
    if ("$dst:$src" =~ /\Axmm([0-7]):xmm([0-7])\z/)
    {	&data_byte(0x66,0x0f,0x3a,0xdf,0xc0|($1<<3)|$2,$imm);	}
    else
    {	die "aeskeygenassist: unsupported operands '$dst','$src'";	}
}
# Common encoder for the two-operand AES-NI instructions: emits
# 66 0F 38 <opcodelet> followed by a register-to-register ModR/M byte
# through data_byte.
#
#   $opcodelet - final opcode byte selecting the instruction
#   $dst, $src - XMM register names, "xmm0" .. "xmm7"
#
# Dies when either operand is not exactly one of xmm0-xmm7.  Only the
# register-to-register form can be encoded here, and emitting nothing
# would silently drop the instruction from the generated code.
sub aescommon
{ my($opcodelet,$dst,$src)=@_;
    if ("$dst:$src" =~ /\Axmm([0-7]):xmm([0-7])\z/)
    {	&data_byte(0x66,0x0f,0x38,$opcodelet,0xc0|($1<<3)|$2);	}
    else
    {	die sprintf("aescommon(0x%02x): unsupported operands '%s','%s'",
		    $opcodelet,$dst,$src);
    }
}
# Mnemonic wrappers over aescommon.  Each wrapper supplies the final
# opcode byte of its instruction and forwards the caller's operands
# unchanged.
{   my @aes_opcodelets = (
	[ "aesimc",	0xdb ],
	[ "aesenc",	0xdc ],
	[ "aesenclast",	0xdd ],
	[ "aesdec",	0xde ],
	[ "aesdeclast",	0xdf ],
    );
    foreach my $entry (@aes_opcodelets)
    {	my ($mnemonic,$opcodelet) = @$entry;
	no strict 'refs';
	*{$mnemonic} = sub { aescommon($opcodelet,@_); };
    }
}
105 # Inline version of internal aesni_[en|de]crypt1
# aesni_inline_generate1($p[,$inout[,$ivec]]) emits, at the point of
# the call, a looped single-block transform:
#   $p     - "enc" or "dec"; selects the aes${p}/aes${p}last helpers
#   $inout - register holding the block, defaults to $inout0
#   $ivec  - optional register; when given it is xored with round key 0
#            and the result is xored into $inout (so $ivec itself is
#            modified), otherwise round key 0 is xored into $inout
# The emitted code advances $key through the key schedule and
# decrements $rounds until it reaches zero, so neither register is
# preserved.  $sn counts invocations and makes every expansion's loop
# label unique.
106 { my $sn;
107 sub aesni_inline_generate1
108 { my ($p,$inout,$ivec)=@_; $inout=$inout0 if (!defined($inout));
109 $sn++;
111 &$movekey ($rndkey0,&QWP(0,$key));
112 &$movekey ($rndkey1,&QWP(16,$key));
113 &xorps ($ivec,$rndkey0) if (defined($ivec));
114 &lea ($key,&DWP(32,$key));
115 &xorps ($inout,$ivec) if (defined($ivec));
116 &xorps ($inout,$rndkey0) if (!defined($ivec));
# One round per iteration; the next round key is fetched after the
# dec so that the jnz below still sees the flags set by dec.
117 &set_label("${p}1_loop_$sn");
118 eval"&aes${p} ($inout,$rndkey1)";
119 &dec ($rounds);
120 &$movekey ($rndkey1,&QWP(0,$key));
121 &lea ($key,&DWP(16,$key));
122 &jnz (&label("${p}1_loop_$sn"));
123 eval"&aes${p}last ($inout,$rndkey1)";
# aesni_generate1($p[,$inout]) emits the private subroutine
# _aesni_${p}rypt1: a single-block transform with every round written
# out explicitly instead of looped.  $p is "enc" or "dec"; $inout
# defaults to $inout0.  The call sites visible in this file invoke it
# only when $inline is false.
126 sub aesni_generate1 # fully unrolled loop
127 { my ($p,$inout)=@_; $inout=$inout0 if (!defined($inout));
129 &function_begin_B("_aesni_${p}rypt1");
130 &movups ($rndkey0,&QWP(0,$key));
131 &$movekey ($rndkey1,&QWP(0x10,$key));
132 &xorps ($inout,$rndkey0);
133 &$movekey ($rndkey0,&QWP(0x20,$key));
134 &lea ($key,&DWP(0x30,$key));
# Dispatch on $rounds relative to 11: below jumps to the "${p}128"
# label, equal jumps to "${p}192", anything larger falls through.
# Each label that is jumped to skips the two rounds emitted before it.
# The je reuses the flags of the cmp; the lea in between does not
# modify them.
135 &cmp ($rounds,11);
136 &jb (&label("${p}128"));
137 &lea ($key,&DWP(0x20,$key));
138 &je (&label("${p}192"));
139 &lea ($key,&DWP(0x20,$key));
140 eval"&aes${p} ($inout,$rndkey1)";
141 &$movekey ($rndkey1,&QWP(-0x40,$key));
142 eval"&aes${p} ($inout,$rndkey0)";
143 &$movekey ($rndkey0,&QWP(-0x30,$key));
144 &set_label("${p}192");
145 eval"&aes${p} ($inout,$rndkey1)";
146 &$movekey ($rndkey1,&QWP(-0x20,$key));
147 eval"&aes${p} ($inout,$rndkey0)";
148 &$movekey ($rndkey0,&QWP(-0x10,$key));
# Common tail: round keys alternate between $rndkey1 and $rndkey0 so
# that the next key is loaded while the current one is in use.
149 &set_label("${p}128");
150 eval"&aes${p} ($inout,$rndkey1)";
151 &$movekey ($rndkey1,&QWP(0,$key));
152 eval"&aes${p} ($inout,$rndkey0)";
153 &$movekey ($rndkey0,&QWP(0x10,$key));
154 eval"&aes${p} ($inout,$rndkey1)";
155 &$movekey ($rndkey1,&QWP(0x20,$key));
156 eval"&aes${p} ($inout,$rndkey0)";
157 &$movekey ($rndkey0,&QWP(0x30,$key));
158 eval"&aes${p} ($inout,$rndkey1)";
159 &$movekey ($rndkey1,&QWP(0x40,$key));
160 eval"&aes${p} ($inout,$rndkey0)";
161 &$movekey ($rndkey0,&QWP(0x50,$key));
162 eval"&aes${p} ($inout,$rndkey1)";
163 &$movekey ($rndkey1,&QWP(0x60,$key));
164 eval"&aes${p} ($inout,$rndkey0)";
165 &$movekey ($rndkey0,&QWP(0x70,$key));
166 eval"&aes${p} ($inout,$rndkey1)";
167 eval"&aes${p}last ($inout,$rndkey0)";
168 &ret();
169 &function_end_B("_aesni_${p}rypt1");
172 # void $PREFIX_encrypt (const void *inp,void *out,const AES_KEY *key);
# Single-block entry point: the block is loaded from inp, transformed
# using the round count read from offset 240 of key, and stored to out.
# eax serves first as the input pointer and then as the output pointer.
# The out-of-line _aesni_encrypt1 helper is generated only when
# $inline is false.
173 &aesni_generate1("enc") if (!$inline);
174 &function_begin_B("${PREFIX}_encrypt");
175 &mov ("eax",&wparam(0));
176 &mov ($key,&wparam(2));
177 &movups ($inout0,&QWP(0,"eax"));
178 &mov ($rounds,&DWP(240,$key));
179 &mov ("eax",&wparam(1));
180 if ($inline)
181 { &aesni_inline_generate1("enc"); }
182 else
183 { &call ("_aesni_encrypt1"); }
184 &movups (&QWP(0,"eax"),$inout0);
185 &ret ();
186 &function_end_B("${PREFIX}_encrypt");
188 # void $PREFIX_decrypt (const void *inp,void *out,const AES_KEY *key);
# Single-block entry point mirroring ${PREFIX}_encrypt with the "dec"
# flavour: load from inp, transform using the round count read from
# offset 240 of key, store to out.  eax serves first as the input
# pointer and then as the output pointer.
189 &aesni_generate1("dec") if(!$inline);
190 &function_begin_B("${PREFIX}_decrypt");
191 &mov ("eax",&wparam(0));
192 &mov ($key,&wparam(2));
193 &movups ($inout0,&QWP(0,"eax"));
194 &mov ($rounds,&DWP(240,$key));
195 &mov ("eax",&wparam(1));
196 if ($inline)
197 { &aesni_inline_generate1("dec"); }
198 else
199 { &call ("_aesni_decrypt1"); }
200 &movups (&QWP(0,"eax"),$inout0);
201 &ret ();
202 &function_end_B("${PREFIX}_decrypt");
204 # _aesni_[en|de]cryptN are private interfaces, N denotes interleave
205 # factor. Why were 3x subroutines originally used in loops? Even though
206 # aes[enc|dec] latency was originally 6, it could be scheduled only
207 # every *2nd* cycle. Thus 3x interleave was the one providing optimal
208 # utilization, i.e. when subroutine's throughput is virtually same as
209 # of non-interleaved subroutine [for number of input blocks up to 3].
210 # This is why it originally made no sense to implement 2x subroutine.
211 # But times change and it became appropriate to spend extra 192 bytes
212 # on 2x subroutine on account of Atom Silvermont. For processors that
213 # can schedule aes[enc|dec] every cycle optimal interleave factor
214 # equals to corresponding instructions latency. 8x is optimal for
215 # * Bridge, but it's unfeasible to accommodate such implementation
216 # in XMM registers addressable in 32-bit mode and therefore maximum
217 # of 6x is used instead...
# aesni_generate2($p) emits the private subroutine _aesni_${p}rypt2,
# which runs the "enc" or "dec" rounds on $inout0 and $inout1 together.
# The emitted code modifies $key and $rounds.
219 sub aesni_generate2
220 { my $p=shift;
222 &function_begin_B("_aesni_${p}rypt2");
# $rounds is scaled by 16 (shl 4), $key is moved forward by 32 plus
# the scaled value, and $rounds is then turned into 16 minus the scaled
# value.  The loop indexes round keys with $key+$rounds and stops when
# the add in the loop brings $rounds to zero.
223 &$movekey ($rndkey0,&QWP(0,$key));
224 &shl ($rounds,4);
225 &$movekey ($rndkey1,&QWP(16,$key));
226 &xorps ($inout0,$rndkey0);
227 &pxor ($inout1,$rndkey0);
228 &$movekey ($rndkey0,&QWP(32,$key));
229 &lea ($key,&DWP(32,$key,$rounds));
230 &neg ($rounds);
231 &add ($rounds,16);
# Two rounds per iteration, alternating $rndkey1 and $rndkey0.
233 &set_label("${p}2_loop");
234 eval"&aes${p} ($inout0,$rndkey1)";
235 eval"&aes${p} ($inout1,$rndkey1)";
236 &$movekey ($rndkey1,&QWP(0,$key,$rounds));
237 &add ($rounds,32);
238 eval"&aes${p} ($inout0,$rndkey0)";
239 eval"&aes${p} ($inout1,$rndkey0)";
240 &$movekey ($rndkey0,&QWP(-16,$key,$rounds));
241 &jnz (&label("${p}2_loop"));
242 eval"&aes${p} ($inout0,$rndkey1)";
243 eval"&aes${p} ($inout1,$rndkey1)";
244 eval"&aes${p}last ($inout0,$rndkey0)";
245 eval"&aes${p}last ($inout1,$rndkey0)";
246 &ret();
247 &function_end_B("_aesni_${p}rypt2");
# aesni_generate3($p) emits the private subroutine _aesni_${p}rypt3,
# which runs the "enc" or "dec" rounds on $inout0..$inout2 together.
# The emitted code modifies $key and $rounds.
250 sub aesni_generate3
251 { my $p=shift;
253 &function_begin_B("_aesni_${p}rypt3");
# $rounds is scaled by 16, $key is moved forward by 32 plus the scaled
# value, and $rounds becomes 16 minus the scaled value; the loop below
# ends when its add brings $rounds to zero.
254 &$movekey ($rndkey0,&QWP(0,$key));
255 &shl ($rounds,4);
256 &$movekey ($rndkey1,&QWP(16,$key));
257 &xorps ($inout0,$rndkey0);
258 &pxor ($inout1,$rndkey0);
259 &pxor ($inout2,$rndkey0);
260 &$movekey ($rndkey0,&QWP(32,$key));
261 &lea ($key,&DWP(32,$key,$rounds));
262 &neg ($rounds);
263 &add ($rounds,16);
# Two rounds per iteration, alternating $rndkey1 and $rndkey0.
265 &set_label("${p}3_loop");
266 eval"&aes${p} ($inout0,$rndkey1)";
267 eval"&aes${p} ($inout1,$rndkey1)";
268 eval"&aes${p} ($inout2,$rndkey1)";
269 &$movekey ($rndkey1,&QWP(0,$key,$rounds));
270 &add ($rounds,32);
271 eval"&aes${p} ($inout0,$rndkey0)";
272 eval"&aes${p} ($inout1,$rndkey0)";
273 eval"&aes${p} ($inout2,$rndkey0)";
274 &$movekey ($rndkey0,&QWP(-16,$key,$rounds));
275 &jnz (&label("${p}3_loop"));
276 eval"&aes${p} ($inout0,$rndkey1)";
277 eval"&aes${p} ($inout1,$rndkey1)";
278 eval"&aes${p} ($inout2,$rndkey1)";
279 eval"&aes${p}last ($inout0,$rndkey0)";
280 eval"&aes${p}last ($inout1,$rndkey0)";
281 eval"&aes${p}last ($inout2,$rndkey0)";
282 &ret();
283 &function_end_B("_aesni_${p}rypt3");
286 # 4x interleave is implemented to improve small block performance,
287 # most notably [and naturally] 4 block by ~30%. One can argue that one
288 # should have implemented 5x as well, but improvement would be <20%,
289 # so it's not worth it...
# aesni_generate4($p) emits the private subroutine _aesni_${p}rypt4,
# which runs the "enc" or "dec" rounds on $inout0..$inout3 together.
# The emitted code modifies $key and $rounds.
290 sub aesni_generate4
291 { my $p=shift;
293 &function_begin_B("_aesni_${p}rypt4");
294 &$movekey ($rndkey0,&QWP(0,$key));
295 &$movekey ($rndkey1,&QWP(16,$key));
296 &shl ($rounds,4);
297 &xorps ($inout0,$rndkey0);
298 &pxor ($inout1,$rndkey0);
299 &pxor ($inout2,$rndkey0);
300 &pxor ($inout3,$rndkey0);
301 &$movekey ($rndkey0,&QWP(32,$key));
302 &lea ($key,&DWP(32,$key,$rounds));
303 &neg ($rounds);
# Raw bytes 0F 1F 40 00 encode a 4-byte NOP; presumably padding so
# that the loop label below lands on a favourable address - TODO confirm.
304 &data_byte (0x0f,0x1f,0x40,0x00);
305 &add ($rounds,16);
# Two rounds per iteration, alternating $rndkey1 and $rndkey0; the
# loop ends when the add brings $rounds to zero.
307 &set_label("${p}4_loop");
308 eval"&aes${p} ($inout0,$rndkey1)";
309 eval"&aes${p} ($inout1,$rndkey1)";
310 eval"&aes${p} ($inout2,$rndkey1)";
311 eval"&aes${p} ($inout3,$rndkey1)";
312 &$movekey ($rndkey1,&QWP(0,$key,$rounds));
313 &add ($rounds,32);
314 eval"&aes${p} ($inout0,$rndkey0)";
315 eval"&aes${p} ($inout1,$rndkey0)";
316 eval"&aes${p} ($inout2,$rndkey0)";
317 eval"&aes${p} ($inout3,$rndkey0)";
318 &$movekey ($rndkey0,&QWP(-16,$key,$rounds));
319 &jnz (&label("${p}4_loop"));
321 eval"&aes${p} ($inout0,$rndkey1)";
322 eval"&aes${p} ($inout1,$rndkey1)";
323 eval"&aes${p} ($inout2,$rndkey1)";
324 eval"&aes${p} ($inout3,$rndkey1)";
325 eval"&aes${p}last ($inout0,$rndkey0)";
326 eval"&aes${p}last ($inout1,$rndkey0)";
327 eval"&aes${p}last ($inout2,$rndkey0)";
328 eval"&aes${p}last ($inout3,$rndkey0)";
329 &ret();
330 &function_end_B("_aesni_${p}rypt4");
# aesni_generate6($p) emits the private subroutine _aesni_${p}rypt6,
# which runs the "enc" or "dec" rounds on $inout0..$inout5 together.
# Besides the normal entry it exposes the static label
# _aesni_${p}rypt6_enter in the middle of the round loop.  A caller
# that has already emitted the equivalent of the prologue below (key
# whitening, first round, $key/$rounds adjustment) can call that label
# directly, as aesni_ctr32_encrypt_blocks does.
333 sub aesni_generate6
334 { my $p=shift;
336 &function_begin_B("_aesni_${p}rypt6");
337 &static_label("_aesni_${p}rypt6_enter");
# Prologue: xor round key 0 into all six blocks and interleave the
# first round with the $key/$rounds set-up ($rounds scaled by 16, $key
# advanced by 32 plus the scaled value, $rounds turned into 16 minus
# the scaled value).
338 &$movekey ($rndkey0,&QWP(0,$key));
339 &shl ($rounds,4);
340 &$movekey ($rndkey1,&QWP(16,$key));
341 &xorps ($inout0,$rndkey0);
342 &pxor ($inout1,$rndkey0); # pxor does better here
343 &pxor ($inout2,$rndkey0);
344 eval"&aes${p} ($inout0,$rndkey1)";
345 &pxor ($inout3,$rndkey0);
346 &pxor ($inout4,$rndkey0);
347 eval"&aes${p} ($inout1,$rndkey1)";
348 &lea ($key,&DWP(32,$key,$rounds));
349 &neg ($rounds);
350 eval"&aes${p} ($inout2,$rndkey1)";
351 &pxor ($inout5,$rndkey0);
352 &add ($rounds,16);
353 eval"&aes${p} ($inout3,$rndkey1)";
354 eval"&aes${p} ($inout4,$rndkey1)";
355 eval"&aes${p} ($inout5,$rndkey1)";
356 &$movekey ($rndkey0,&QWP(-16,$key,$rounds));
357 &jmp (&label("_aesni_${p}rypt6_enter"));
# Round loop, two rounds per iteration; the first half is skipped on
# entry because the prologue has already applied $rndkey1.
359 &set_label("${p}6_loop",16);
360 eval"&aes${p} ($inout0,$rndkey1)";
361 eval"&aes${p} ($inout1,$rndkey1)";
362 eval"&aes${p} ($inout2,$rndkey1)";
363 eval"&aes${p} ($inout3,$rndkey1)";
364 eval"&aes${p} ($inout4,$rndkey1)";
365 eval"&aes${p} ($inout5,$rndkey1)";
366 &set_label("_aesni_${p}rypt6_enter");
367 &$movekey ($rndkey1,&QWP(0,$key,$rounds));
368 &add ($rounds,32);
369 eval"&aes${p} ($inout0,$rndkey0)";
370 eval"&aes${p} ($inout1,$rndkey0)";
371 eval"&aes${p} ($inout2,$rndkey0)";
372 eval"&aes${p} ($inout3,$rndkey0)";
373 eval"&aes${p} ($inout4,$rndkey0)";
374 eval"&aes${p} ($inout5,$rndkey0)";
375 &$movekey ($rndkey0,&QWP(-16,$key,$rounds));
376 &jnz (&label("${p}6_loop"));
378 eval"&aes${p} ($inout0,$rndkey1)";
379 eval"&aes${p} ($inout1,$rndkey1)";
380 eval"&aes${p} ($inout2,$rndkey1)";
381 eval"&aes${p} ($inout3,$rndkey1)";
382 eval"&aes${p} ($inout4,$rndkey1)";
383 eval"&aes${p} ($inout5,$rndkey1)";
384 eval"&aes${p}last ($inout0,$rndkey0)";
385 eval"&aes${p}last ($inout1,$rndkey0)";
386 eval"&aes${p}last ($inout2,$rndkey0)";
387 eval"&aes${p}last ($inout3,$rndkey0)";
388 eval"&aes${p}last ($inout4,$rndkey0)";
389 eval"&aes${p}last ($inout5,$rndkey0)";
390 &ret();
391 &function_end_B("_aesni_${p}rypt6");
# Instantiate the 2x, 3x, 4x and 6x interleaved subroutines, in that
# order.  The "dec" flavour is always generated; the "enc" flavour is
# generated ahead of it only when $PREFIX is "aesni".
foreach my $generate (\&aesni_generate2, \&aesni_generate3,
		      \&aesni_generate4, \&aesni_generate6)
{   $generate->("enc") if ($PREFIX eq "aesni");
    $generate->("dec");
}
402 if ($PREFIX eq "aesni") {
403 ######################################################################
404 # void aesni_ecb_encrypt (const void *in, void *out,
405 # size_t length, const AES_KEY *key,
406 # int enc);
# Prologue and bulk encryption path.  Arguments 0..3 are loaded into
# $inp, $out, $len and $key; argument 4 (enc) is held in $rounds_ only
# until it has been tested.
407 &function_begin("aesni_ecb_encrypt");
408 &mov ($inp,&wparam(0));
409 &mov ($out,&wparam(1));
410 &mov ($len,&wparam(2));
411 &mov ($key,&wparam(3));
412 &mov ($rounds_,&wparam(4));
# Round the length down to a multiple of 16; nothing to do if that
# leaves zero.
413 &and ($len,-16);
414 &jz (&label("ecb_ret"));
415 &mov ($rounds,&DWP(240,$key));
# enc == 0 selects the decrypt half of the function.
416 &test ($rounds_,$rounds_);
417 &jz (&label("ecb_decrypt"));
419 &mov ($key_,$key); # backup $key
420 &mov ($rounds_,$rounds); # backup $rounds
# Fewer than six blocks (0x60 bytes) go straight to the tail code.
421 &cmp ($len,0x60);
422 &jb (&label("ecb_enc_tail"));
# Load the first six blocks and account for them in $len up front.
424 &movdqu ($inout0,&QWP(0,$inp));
425 &movdqu ($inout1,&QWP(0x10,$inp));
426 &movdqu ($inout2,&QWP(0x20,$inp));
427 &movdqu ($inout3,&QWP(0x30,$inp));
428 &movdqu ($inout4,&QWP(0x40,$inp));
429 &movdqu ($inout5,&QWP(0x50,$inp));
430 &lea ($inp,&DWP(0x60,$inp));
431 &sub ($len,0x60);
432 &jmp (&label("ecb_enc_loop6_enter"));
# Loop body: store the six results of the previous pass while loading
# the next six input blocks into the same registers.
434 &set_label("ecb_enc_loop6",16);
435 &movups (&QWP(0,$out),$inout0);
436 &movdqu ($inout0,&QWP(0,$inp));
437 &movups (&QWP(0x10,$out),$inout1);
438 &movdqu ($inout1,&QWP(0x10,$inp));
439 &movups (&QWP(0x20,$out),$inout2);
440 &movdqu ($inout2,&QWP(0x20,$inp));
441 &movups (&QWP(0x30,$out),$inout3);
442 &movdqu ($inout3,&QWP(0x30,$inp));
443 &movups (&QWP(0x40,$out),$inout4);
444 &movdqu ($inout4,&QWP(0x40,$inp));
445 &movups (&QWP(0x50,$out),$inout5);
446 &lea ($out,&DWP(0x60,$out));
447 &movdqu ($inout5,&QWP(0x50,$inp));
448 &lea ($inp,&DWP(0x60,$inp));
449 &set_label("ecb_enc_loop6_enter");
451 &call ("_aesni_encrypt6");
# $key and $rounds are reloaded from their backups after every call.
# Keep looping while another full 0x60 bytes can be subtracted from
# $len without a borrow.
453 &mov ($key,$key_); # restore $key
454 &mov ($rounds,$rounds_); # restore $rounds
455 &sub ($len,0x60);
456 &jnc (&label("ecb_enc_loop6"));
# Flush the final six results, undo the last subtraction and finish
# if no blocks remain; otherwise fall into the tail code.
458 &movups (&QWP(0,$out),$inout0);
459 &movups (&QWP(0x10,$out),$inout1);
460 &movups (&QWP(0x20,$out),$inout2);
461 &movups (&QWP(0x30,$out),$inout3);
462 &movups (&QWP(0x40,$out),$inout4);
463 &movups (&QWP(0x50,$out),$inout5);
464 &lea ($out,&DWP(0x60,$out));
465 &add ($len,0x60);
466 &jz (&label("ecb_ret"));
# Encrypt the last one to five blocks; this label is reached with
# 0x10 <= $len < 0x60.  Blocks are loaded one at a time, and the
# comparisons of $len against 0x20 and 0x40 route the one- to
# four-block cases to ecb_enc_one/two/three/four.  Only the five-block
# case is handled here.
&set_label("ecb_enc_tail");
	&movups	($inout0,&QWP(0,$inp));
	&cmp	($len,0x20);
	&jb	(&label("ecb_enc_one"));
	&movups	($inout1,&QWP(0x10,$inp));
	&je	(&label("ecb_enc_two"));
	&movups	($inout2,&QWP(0x20,$inp));
	&cmp	($len,0x40);
	&jb	(&label("ecb_enc_three"));
	&movups	($inout3,&QWP(0x30,$inp));
	&je	(&label("ecb_enc_four"));
	&movups	($inout4,&QWP(0x40,$inp));
	# The 6x routine always processes six registers: clear the unused
	# sixth one; its result is not stored.
	&xorps	($inout5,$inout5);
	&call	("_aesni_encrypt6");
	&movups	(&QWP(0,$out),$inout0);
	&movups	(&QWP(0x10,$out),$inout1);
	&movups	(&QWP(0x20,$out),$inout2);
	&movups	(&QWP(0x30,$out),$inout3);
	&movups	(&QWP(0x40,$out),$inout4);
	# Called with the same &-form as every other emitter call in this
	# file (the original line omitted the sigil).
	&jmp	(&label("ecb_ret"));
# Handlers for exactly one, two, three and four remaining blocks.  The
# input blocks have already been loaded by the ecb_enc_tail code; each
# handler runs the matching Nx routine, stores N results and returns.
489 &set_label("ecb_enc_one",16);
490 if ($inline)
491 { &aesni_inline_generate1("enc"); }
492 else
493 { &call ("_aesni_encrypt1"); }
494 &movups (&QWP(0,$out),$inout0);
495 &jmp (&label("ecb_ret"));
497 &set_label("ecb_enc_two",16);
498 &call ("_aesni_encrypt2");
499 &movups (&QWP(0,$out),$inout0);
500 &movups (&QWP(0x10,$out),$inout1);
501 &jmp (&label("ecb_ret"));
503 &set_label("ecb_enc_three",16);
504 &call ("_aesni_encrypt3");
505 &movups (&QWP(0,$out),$inout0);
506 &movups (&QWP(0x10,$out),$inout1);
507 &movups (&QWP(0x20,$out),$inout2);
508 &jmp (&label("ecb_ret"));
510 &set_label("ecb_enc_four",16);
511 &call ("_aesni_encrypt4");
512 &movups (&QWP(0,$out),$inout0);
513 &movups (&QWP(0x10,$out),$inout1);
514 &movups (&QWP(0x20,$out),$inout2);
515 &movups (&QWP(0x30,$out),$inout3);
516 &jmp (&label("ecb_ret"));
517 ######################################################################
# Decrypt half of aesni_ecb_encrypt, reached when the enc argument is
# zero.  Same structure as the encrypt half: six blocks per pass
# through _aesni_decrypt6, with $key/$rounds restored from $key_/
# $rounds_ after each call.
518 &set_label("ecb_decrypt",16);
519 &mov ($key_,$key); # backup $key
520 &mov ($rounds_,$rounds); # backup $rounds
# Fewer than six blocks (0x60 bytes) go straight to the tail code.
521 &cmp ($len,0x60);
522 &jb (&label("ecb_dec_tail"));
# Load the first six blocks and account for them in $len up front.
524 &movdqu ($inout0,&QWP(0,$inp));
525 &movdqu ($inout1,&QWP(0x10,$inp));
526 &movdqu ($inout2,&QWP(0x20,$inp));
527 &movdqu ($inout3,&QWP(0x30,$inp));
528 &movdqu ($inout4,&QWP(0x40,$inp));
529 &movdqu ($inout5,&QWP(0x50,$inp));
530 &lea ($inp,&DWP(0x60,$inp));
531 &sub ($len,0x60);
532 &jmp (&label("ecb_dec_loop6_enter"));
# Loop body: store the six results of the previous pass while loading
# the next six input blocks into the same registers.
534 &set_label("ecb_dec_loop6",16);
535 &movups (&QWP(0,$out),$inout0);
536 &movdqu ($inout0,&QWP(0,$inp));
537 &movups (&QWP(0x10,$out),$inout1);
538 &movdqu ($inout1,&QWP(0x10,$inp));
539 &movups (&QWP(0x20,$out),$inout2);
540 &movdqu ($inout2,&QWP(0x20,$inp));
541 &movups (&QWP(0x30,$out),$inout3);
542 &movdqu ($inout3,&QWP(0x30,$inp));
543 &movups (&QWP(0x40,$out),$inout4);
544 &movdqu ($inout4,&QWP(0x40,$inp));
545 &movups (&QWP(0x50,$out),$inout5);
546 &lea ($out,&DWP(0x60,$out));
547 &movdqu ($inout5,&QWP(0x50,$inp));
548 &lea ($inp,&DWP(0x60,$inp));
549 &set_label("ecb_dec_loop6_enter");
551 &call ("_aesni_decrypt6");
# Keep looping while another full 0x60 bytes can be subtracted from
# $len without a borrow.
553 &mov ($key,$key_); # restore $key
554 &mov ($rounds,$rounds_); # restore $rounds
555 &sub ($len,0x60);
556 &jnc (&label("ecb_dec_loop6"));
# Flush the final six results, undo the last subtraction and finish
# if no blocks remain; otherwise fall into the tail code.
558 &movups (&QWP(0,$out),$inout0);
559 &movups (&QWP(0x10,$out),$inout1);
560 &movups (&QWP(0x20,$out),$inout2);
561 &movups (&QWP(0x30,$out),$inout3);
562 &movups (&QWP(0x40,$out),$inout4);
563 &movups (&QWP(0x50,$out),$inout5);
564 &lea ($out,&DWP(0x60,$out));
565 &add ($len,0x60);
566 &jz (&label("ecb_ret"));
# Decrypt the last one to five blocks.  Blocks are loaded one at a
# time, and the comparisons of $len against 0x20 and 0x40 route the
# one- to four-block cases to ecb_dec_one/two/three/four.  Only the
# five-block case is handled here.
568 &set_label("ecb_dec_tail");
569 &movups ($inout0,&QWP(0,$inp));
570 &cmp ($len,0x20);
571 &jb (&label("ecb_dec_one"));
572 &movups ($inout1,&QWP(0x10,$inp));
573 &je (&label("ecb_dec_two"));
574 &movups ($inout2,&QWP(0x20,$inp));
575 &cmp ($len,0x40);
576 &jb (&label("ecb_dec_three"));
577 &movups ($inout3,&QWP(0x30,$inp));
578 &je (&label("ecb_dec_four"));
579 &movups ($inout4,&QWP(0x40,$inp));
# The sixth register is cleared before the 6x call; its result is not
# stored.
580 &xorps ($inout5,$inout5);
581 &call ("_aesni_decrypt6");
582 &movups (&QWP(0,$out),$inout0);
583 &movups (&QWP(0x10,$out),$inout1);
584 &movups (&QWP(0x20,$out),$inout2);
585 &movups (&QWP(0x30,$out),$inout3);
586 &movups (&QWP(0x40,$out),$inout4);
587 &jmp (&label("ecb_ret"));
# Handlers for exactly one, two, three and four remaining blocks.  The
# input blocks have already been loaded by the ecb_dec_tail code; each
# handler runs the matching Nx routine and stores N results.  The
# four-block handler is last and falls through to ecb_ret without a
# jump.
589 &set_label("ecb_dec_one",16);
590 if ($inline)
591 { &aesni_inline_generate1("dec"); }
592 else
593 { &call ("_aesni_decrypt1"); }
594 &movups (&QWP(0,$out),$inout0);
595 &jmp (&label("ecb_ret"));
597 &set_label("ecb_dec_two",16);
598 &call ("_aesni_decrypt2");
599 &movups (&QWP(0,$out),$inout0);
600 &movups (&QWP(0x10,$out),$inout1);
601 &jmp (&label("ecb_ret"));
603 &set_label("ecb_dec_three",16);
604 &call ("_aesni_decrypt3");
605 &movups (&QWP(0,$out),$inout0);
606 &movups (&QWP(0x10,$out),$inout1);
607 &movups (&QWP(0x20,$out),$inout2);
608 &jmp (&label("ecb_ret"));
610 &set_label("ecb_dec_four",16);
611 &call ("_aesni_decrypt4");
612 &movups (&QWP(0,$out),$inout0);
613 &movups (&QWP(0x10,$out),$inout1);
614 &movups (&QWP(0x20,$out),$inout2);
615 &movups (&QWP(0x30,$out),$inout3);
# Common exit for both the encrypt and the decrypt half.
617 &set_label("ecb_ret");
618 &function_end("aesni_ecb_encrypt");
620 ######################################################################
621 # void aesni_ccm64_[en|de]crypt_blocks (const void *in, void *out,
622 # size_t blocks, const AES_KEY *key,
623 # const char *ivec,char *cmac);
625 # Handles only complete blocks, operates on 64-bit counter and
626 # does not update *ivec! Nor does it finalize CMAC value
627 # (see engine/eng_aesni.c for details)
# $cmac is another name for the $inout1 register within this scope.
629 { my $cmac=$inout1;
# Arguments 0..3 go to $inp, $out, $len (a block count) and $key; the
# ivec and cmac pointers (arguments 4 and 5) are parked in $rounds_
# and $rounds until their contents have been loaded.
630 &function_begin("aesni_ccm64_encrypt_blocks");
631 &mov ($inp,&wparam(0));
632 &mov ($out,&wparam(1));
633 &mov ($len,&wparam(2));
634 &mov ($key,&wparam(3));
635 &mov ($rounds_,&wparam(4));
636 &mov ($rounds,&wparam(5));
# Carve out a 16-byte aligned scratch area and keep the caller's esp
# at offset 48 of it.
637 &mov ($key_,"esp");
638 &sub ("esp",60);
639 &and ("esp",-16); # align stack
640 &mov (&DWP(48,"esp"),$key_);
642 &movdqu ($ivec,&QWP(0,$rounds_)); # load ivec
643 &movdqu ($cmac,&QWP(0,$rounds)); # load cmac
644 &mov ($rounds,&DWP(240,$key));
646 # compose byte-swap control mask for pshufb on stack
647 &mov (&DWP(0,"esp"),0x0c0d0e0f);
648 &mov (&DWP(4,"esp"),0x08090a0b);
649 &mov (&DWP(8,"esp"),0x04050607);
650 &mov (&DWP(12,"esp"),0x00010203);
652 # compose counter increment vector on stack
653 &mov ($rounds_,1);
654 &xor ($key_,$key_);
655 &mov (&DWP(16,"esp"),$rounds_);
656 &mov (&DWP(20,"esp"),$key_);
657 &mov (&DWP(24,"esp"),$key_);
658 &mov (&DWP(28,"esp"),$key_);
# Loop set-up: $key_ keeps the start of the key schedule, $key points
# 32 bytes plus 16*rounds past it, and $rounds_ holds 16 minus
# 16*rounds, the start value of the inner loop index.  $inout3 keeps
# the byte-swap mask; $ivec is kept in byte-swapped form so that paddq
# can increment it, while $inout0 holds the counter block to encrypt.
660 &shl ($rounds,4);
661 &mov ($rounds_,16);
662 &lea ($key_,&DWP(0,$key));
663 &movdqa ($inout3,&QWP(0,"esp"));
664 &movdqa ($inout0,$ivec);
665 &lea ($key,&DWP(32,$key,$rounds));
666 &sub ($rounds_,$rounds);
667 &pshufb ($ivec,$inout3);
# One input block per outer iteration.
669 &set_label("ccm64_enc_outer");
670 &$movekey ($rndkey0,&QWP(0,$key_));
671 &mov ($rounds,$rounds_);
672 &movups ($in0,&QWP(0,$inp));
# Round key 0 is xored into the counter block; the input block is
# folded into the CMAC together with round key 0 in a single xor.
674 &xorps ($inout0,$rndkey0);
675 &$movekey ($rndkey1,&QWP(16,$key_));
676 &xorps ($rndkey0,$in0);
677 &xorps ($cmac,$rndkey0); # cmac^=inp
678 &$movekey ($rndkey0,&QWP(32,$key_));
# Counter block and CMAC are encrypted side by side, two rounds per
# iteration.
680 &set_label("ccm64_enc2_loop");
681 &aesenc ($inout0,$rndkey1);
682 &aesenc ($cmac,$rndkey1);
683 &$movekey ($rndkey1,&QWP(0,$key,$rounds));
684 &add ($rounds,32);
685 &aesenc ($inout0,$rndkey0);
686 &aesenc ($cmac,$rndkey0);
687 &$movekey ($rndkey0,&QWP(-16,$key,$rounds));
688 &jnz (&label("ccm64_enc2_loop"));
689 &aesenc ($inout0,$rndkey1);
690 &aesenc ($cmac,$rndkey1);
# Advance the byte-swapped counter and count the block; the flags of
# this dec are the ones tested by the jnz that closes the outer loop.
691 &paddq ($ivec,&QWP(16,"esp"));
692 &dec ($len);
693 &aesenclast ($inout0,$rndkey0);
694 &aesenclast ($cmac,$rndkey0);
696 &lea ($inp,&DWP(16,$inp));
697 &xorps ($in0,$inout0); # inp^=E(ivec)
698 &movdqa ($inout0,$ivec);
699 &movups (&QWP(0,$out),$in0); # save output
700 &pshufb ($inout0,$inout3);
701 &lea ($out,&DWP(16,$out));
702 &jnz (&label("ccm64_enc_outer"));
# Restore the caller's esp and write the CMAC back through argument 5.
704 &mov ("esp",&DWP(48,"esp"));
705 &mov ($out,&wparam(5));
706 &movups (&QWP(0,$out),$cmac);
707 &function_end("aesni_ccm64_encrypt_blocks");
# Decrypt counterpart of aesni_ccm64_encrypt_blocks.  The first counter
# block is encrypted on its own; inside the loop the encryption of the
# next counter block is paired with the CMAC update over the block
# that has just been recovered, and the last recovered block is folded
# into the CMAC after the loop.
709 &function_begin("aesni_ccm64_decrypt_blocks");
710 &mov ($inp,&wparam(0));
711 &mov ($out,&wparam(1));
712 &mov ($len,&wparam(2));
713 &mov ($key,&wparam(3));
714 &mov ($rounds_,&wparam(4));
715 &mov ($rounds,&wparam(5));
# Carve out a 16-byte aligned scratch area and keep the caller's esp
# at offset 48 of it.
716 &mov ($key_,"esp");
717 &sub ("esp",60);
718 &and ("esp",-16); # align stack
719 &mov (&DWP(48,"esp"),$key_);
721 &movdqu ($ivec,&QWP(0,$rounds_)); # load ivec
722 &movdqu ($cmac,&QWP(0,$rounds)); # load cmac
723 &mov ($rounds,&DWP(240,$key));
725 # compose byte-swap control mask for pshufb on stack
726 &mov (&DWP(0,"esp"),0x0c0d0e0f);
727 &mov (&DWP(4,"esp"),0x08090a0b);
728 &mov (&DWP(8,"esp"),0x04050607);
729 &mov (&DWP(12,"esp"),0x00010203);
731 # compose counter increment vector on stack
732 &mov ($rounds_,1);
733 &xor ($key_,$key_);
734 &mov (&DWP(16,"esp"),$rounds_);
735 &mov (&DWP(20,"esp"),$key_);
736 &mov (&DWP(24,"esp"),$key_);
737 &mov (&DWP(28,"esp"),$key_);
739 &movdqa ($inout3,&QWP(0,"esp")); # bswap mask
740 &movdqa ($inout0,$ivec);
# Back up $key and $rounds: the single-block code that follows
# modifies both.
742 &mov ($key_,$key);
743 &mov ($rounds_,$rounds);
745 &pshufb ($ivec,$inout3);
746 if ($inline)
747 { &aesni_inline_generate1("enc"); }
748 else
749 { &call ("_aesni_encrypt1"); }
# Loop set-up: $key points 32 bytes plus 16*rounds past the start of
# the key schedule kept in $key_, and $rounds_ ends up holding 16
# minus 16*rounds, the start value of the inner loop index.
750 &shl ($rounds_,4);
751 &mov ($rounds,16);
752 &movups ($in0,&QWP(0,$inp)); # load inp
753 &paddq ($ivec,&QWP(16,"esp"));
754 &lea ($inp,&QWP(16,$inp));
755 &sub ($rounds,$rounds_);
756 &lea ($key,&DWP(32,$key_,$rounds_));
757 &mov ($rounds_,$rounds);
758 &jmp (&label("ccm64_dec_outer"));
# Top of an iteration: $inout0 holds the encrypted counter and $in0
# the input block.  Recover and store the output block, then prepare
# the next counter block.
760 &set_label("ccm64_dec_outer",16);
761 &xorps ($in0,$inout0); # inp ^= E(ivec)
762 &movdqa ($inout0,$ivec);
763 &movups (&QWP(0,$out),$in0); # save output
764 &lea ($out,&DWP(16,$out));
765 &pshufb ($inout0,$inout3);
767 &sub ($len,1);
768 &jz (&label("ccm64_dec_break"));
# Round key 0 goes into the next counter block; the recovered block
# is folded into the CMAC together with round key 0.
770 &$movekey ($rndkey0,&QWP(0,$key_));
771 &mov ($rounds,$rounds_);
772 &$movekey ($rndkey1,&QWP(16,$key_));
773 &xorps ($in0,$rndkey0);
774 &xorps ($inout0,$rndkey0);
775 &xorps ($cmac,$in0); # cmac^=out
776 &$movekey ($rndkey0,&QWP(32,$key_));
# Counter block and CMAC are encrypted side by side, two rounds per
# iteration.
778 &set_label("ccm64_dec2_loop");
779 &aesenc ($inout0,$rndkey1);
780 &aesenc ($cmac,$rndkey1);
781 &$movekey ($rndkey1,&QWP(0,$key,$rounds));
782 &add ($rounds,32);
783 &aesenc ($inout0,$rndkey0);
784 &aesenc ($cmac,$rndkey0);
785 &$movekey ($rndkey0,&QWP(-16,$key,$rounds));
786 &jnz (&label("ccm64_dec2_loop"));
787 &movups ($in0,&QWP(0,$inp)); # load inp
788 &paddq ($ivec,&QWP(16,"esp"));
789 &aesenc ($inout0,$rndkey1);
790 &aesenc ($cmac,$rndkey1);
791 &aesenclast ($inout0,$rndkey0);
792 &aesenclast ($cmac,$rndkey0);
793 &lea ($inp,&QWP(16,$inp));
794 &jmp (&label("ccm64_dec_outer"));
# After the last block: reload $rounds and $key and run one more
# single-block encryption on the CMAC, with the last recovered block
# ($in0) passed as the value to xor in first.
# NOTE(review): the non-inline branch passes $cmac as an extra
# argument to &call and does not mention $in0; it looks like it would
# not compute the same result as the inline branch - confirm before
# building with $inline disabled.
796 &set_label("ccm64_dec_break",16);
797 &mov ($rounds,&DWP(240,$key_));
798 &mov ($key,$key_);
799 if ($inline)
800 { &aesni_inline_generate1("enc",$cmac,$in0); }
801 else
802 { &call ("_aesni_encrypt1",$cmac); }
# Restore the caller's esp and write the CMAC back through argument 5.
804 &mov ("esp",&DWP(48,"esp"));
805 &mov ($out,&wparam(5));
806 &movups (&QWP(0,$out),$cmac);
807 &function_end("aesni_ccm64_decrypt_blocks");
810 ######################################################################
811 # void aesni_ctr32_encrypt_blocks (const void *in, void *out,
812 # size_t blocks, const AES_KEY *key,
813 # const char *ivec);
815 # Handles only complete blocks, operates on 32-bit counter and
816 # does not update *ivec! (see crypto/modes/ctr128.c for details)
818 # stack layout:
819 # 0 pshufb mask
820 # 16 vector addend: 0,6,6,6
821 # 32 counter-less ivec
822 # 48 1st triplet of counter vector
823 # 64 2nd triplet of counter vector
824 # 80 saved %esp
826 &function_begin("aesni_ctr32_encrypt_blocks");
827 &mov ($inp,&wparam(0));
828 &mov ($out,&wparam(1));
829 &mov ($len,&wparam(2));
830 &mov ($key,&wparam(3));
831 &mov ($rounds_,&wparam(4));
832 &mov ($key_,"esp");
833 &sub ("esp",88);
834 &and ("esp",-16); # align stack
835 &mov (&DWP(80,"esp"),$key_);
837 &cmp ($len,1);
838 &je (&label("ctr32_one_shortcut"));
840 &movdqu ($inout5,&QWP(0,$rounds_)); # load ivec
842 # compose byte-swap control mask for pshufb on stack
843 &mov (&DWP(0,"esp"),0x0c0d0e0f);
844 &mov (&DWP(4,"esp"),0x08090a0b);
845 &mov (&DWP(8,"esp"),0x04050607);
846 &mov (&DWP(12,"esp"),0x00010203);
848 # compose counter increment vector on stack
849 &mov ($rounds,6);
850 &xor ($key_,$key_);
851 &mov (&DWP(16,"esp"),$rounds);
852 &mov (&DWP(20,"esp"),$rounds);
853 &mov (&DWP(24,"esp"),$rounds);
854 &mov (&DWP(28,"esp"),$key_);
856 &pextrd ($rounds_,$inout5,3); # pull 32-bit counter
857 &pinsrd ($inout5,$key_,3); # wipe 32-bit counter
859 &mov ($rounds,&DWP(240,$key)); # key->rounds
861 # compose 2 vectors of 3x32-bit counters
862 &bswap ($rounds_);
863 &pxor ($rndkey0,$rndkey0);
864 &pxor ($rndkey1,$rndkey1);
865 &movdqa ($inout0,&QWP(0,"esp")); # load byte-swap mask
866 &pinsrd ($rndkey0,$rounds_,0);
867 &lea ($key_,&DWP(3,$rounds_));
868 &pinsrd ($rndkey1,$key_,0);
869 &inc ($rounds_);
870 &pinsrd ($rndkey0,$rounds_,1);
871 &inc ($key_);
872 &pinsrd ($rndkey1,$key_,1);
873 &inc ($rounds_);
874 &pinsrd ($rndkey0,$rounds_,2);
875 &inc ($key_);
876 &pinsrd ($rndkey1,$key_,2);
877 &movdqa (&QWP(48,"esp"),$rndkey0); # save 1st triplet
878 &pshufb ($rndkey0,$inout0); # byte swap
879 &movdqu ($inout4,&QWP(0,$key)); # key[0]
880 &movdqa (&QWP(64,"esp"),$rndkey1); # save 2nd triplet
881 &pshufb ($rndkey1,$inout0); # byte swap
883 &pshufd ($inout0,$rndkey0,3<<6); # place counter to upper dword
884 &pshufd ($inout1,$rndkey0,2<<6);
885 &cmp ($len,6);
886 &jb (&label("ctr32_tail"));
887 &pxor ($inout5,$inout4); # counter-less ivec^key[0]
888 &shl ($rounds,4);
889 &mov ($rounds_,16);
890 &movdqa (&QWP(32,"esp"),$inout5); # save counter-less ivec^key[0]
891 &mov ($key_,$key); # backup $key
892 &sub ($rounds_,$rounds); # backup twisted $rounds
893 &lea ($key,&DWP(32,$key,$rounds));
894 &sub ($len,6);
895 &jmp (&label("ctr32_loop6"));
897 &set_label("ctr32_loop6",16);
898 # inlining _aesni_encrypt6's prologue gives ~6% improvement...
899 &pshufd ($inout2,$rndkey0,1<<6);
900 &movdqa ($rndkey0,&QWP(32,"esp")); # pull counter-less ivec
901 &pshufd ($inout3,$rndkey1,3<<6);
902 &pxor ($inout0,$rndkey0); # merge counter-less ivec
903 &pshufd ($inout4,$rndkey1,2<<6);
904 &pxor ($inout1,$rndkey0);
905 &pshufd ($inout5,$rndkey1,1<<6);
906 &$movekey ($rndkey1,&QWP(16,$key_));
907 &pxor ($inout2,$rndkey0);
908 &pxor ($inout3,$rndkey0);
909 &aesenc ($inout0,$rndkey1);
910 &pxor ($inout4,$rndkey0);
911 &pxor ($inout5,$rndkey0);
912 &aesenc ($inout1,$rndkey1);
913 &$movekey ($rndkey0,&QWP(32,$key_));
914 &mov ($rounds,$rounds_);
915 &aesenc ($inout2,$rndkey1);
916 &aesenc ($inout3,$rndkey1);
917 &aesenc ($inout4,$rndkey1);
918 &aesenc ($inout5,$rndkey1);
920 &call (&label("_aesni_encrypt6_enter"));
922 &movups ($rndkey1,&QWP(0,$inp));
923 &movups ($rndkey0,&QWP(0x10,$inp));
924 &xorps ($inout0,$rndkey1);
925 &movups ($rndkey1,&QWP(0x20,$inp));
926 &xorps ($inout1,$rndkey0);
927 &movups (&QWP(0,$out),$inout0);
928 &movdqa ($rndkey0,&QWP(16,"esp")); # load increment
929 &xorps ($inout2,$rndkey1);
930 &movdqa ($rndkey1,&QWP(64,"esp")); # load 2nd triplet
931 &movups (&QWP(0x10,$out),$inout1);
932 &movups (&QWP(0x20,$out),$inout2);
934 &paddd ($rndkey1,$rndkey0); # 2nd triplet increment
935 &paddd ($rndkey0,&QWP(48,"esp")); # 1st triplet increment
936 &movdqa ($inout0,&QWP(0,"esp")); # load byte swap mask
938 &movups ($inout1,&QWP(0x30,$inp));
939 &movups ($inout2,&QWP(0x40,$inp));
940 &xorps ($inout3,$inout1);
941 &movups ($inout1,&QWP(0x50,$inp));
942 &lea ($inp,&DWP(0x60,$inp));
943 &movdqa (&QWP(48,"esp"),$rndkey0); # save 1st triplet
944 &pshufb ($rndkey0,$inout0); # byte swap
945 &xorps ($inout4,$inout2);
946 &movups (&QWP(0x30,$out),$inout3);
947 &xorps ($inout5,$inout1);
948 &movdqa (&QWP(64,"esp"),$rndkey1); # save 2nd triplet
949 &pshufb ($rndkey1,$inout0); # byte swap
950 &movups (&QWP(0x40,$out),$inout4);
951 &pshufd ($inout0,$rndkey0,3<<6);
952 &movups (&QWP(0x50,$out),$inout5);
953 &lea ($out,&DWP(0x60,$out));
955 &pshufd ($inout1,$rndkey0,2<<6);
956 &sub ($len,6);
957 &jnc (&label("ctr32_loop6"));
959 &add ($len,6);
960 &jz (&label("ctr32_ret"));
961 &movdqu ($inout5,&QWP(0,$key_));
962 &mov ($key,$key_);
963 &pxor ($inout5,&QWP(32,"esp")); # restore count-less ivec
964 &mov ($rounds,&DWP(240,$key_)); # restore $rounds
966 &set_label("ctr32_tail");
967 &por ($inout0,$inout5);
968 &cmp ($len,2);
969 &jb (&label("ctr32_one"));
971 &pshufd ($inout2,$rndkey0,1<<6);
972 &por ($inout1,$inout5);
973 &je (&label("ctr32_two"));
975 &pshufd ($inout3,$rndkey1,3<<6);
976 &por ($inout2,$inout5);
977 &cmp ($len,4);
978 &jb (&label("ctr32_three"));
980 &pshufd ($inout4,$rndkey1,2<<6);
981 &por ($inout3,$inout5);
982 &je (&label("ctr32_four"));
984 &por ($inout4,$inout5);
985 &call ("_aesni_encrypt6");
986 &movups ($rndkey1,&QWP(0,$inp));
987 &movups ($rndkey0,&QWP(0x10,$inp));
988 &xorps ($inout0,$rndkey1);
989 &movups ($rndkey1,&QWP(0x20,$inp));
990 &xorps ($inout1,$rndkey0);
991 &movups ($rndkey0,&QWP(0x30,$inp));
992 &xorps ($inout2,$rndkey1);
993 &movups ($rndkey1,&QWP(0x40,$inp));
994 &xorps ($inout3,$rndkey0);
995 &movups (&QWP(0,$out),$inout0);
996 &xorps ($inout4,$rndkey1);
997 &movups (&QWP(0x10,$out),$inout1);
998 &movups (&QWP(0x20,$out),$inout2);
999 &movups (&QWP(0x30,$out),$inout3);
1000 &movups (&QWP(0x40,$out),$inout4);
1001 &jmp (&label("ctr32_ret"));
1003 &set_label("ctr32_one_shortcut",16);
1004 &movups ($inout0,&QWP(0,$rounds_)); # load ivec
1005 &mov ($rounds,&DWP(240,$key));
1007 &set_label("ctr32_one");
1008 if ($inline)
1009 { &aesni_inline_generate1("enc"); }
1010 else
1011 { &call ("_aesni_encrypt1"); }
1012 &movups ($in0,&QWP(0,$inp));
1013 &xorps ($in0,$inout0);
1014 &movups (&QWP(0,$out),$in0);
1015 &jmp (&label("ctr32_ret"));
1017 &set_label("ctr32_two",16);
1018 &call ("_aesni_encrypt2");
1019 &movups ($inout3,&QWP(0,$inp));
1020 &movups ($inout4,&QWP(0x10,$inp));
1021 &xorps ($inout0,$inout3);
1022 &xorps ($inout1,$inout4);
1023 &movups (&QWP(0,$out),$inout0);
1024 &movups (&QWP(0x10,$out),$inout1);
1025 &jmp (&label("ctr32_ret"));
1027 &set_label("ctr32_three",16);
1028 &call ("_aesni_encrypt3");
1029 &movups ($inout3,&QWP(0,$inp));
1030 &movups ($inout4,&QWP(0x10,$inp));
1031 &xorps ($inout0,$inout3);
1032 &movups ($inout5,&QWP(0x20,$inp));
1033 &xorps ($inout1,$inout4);
1034 &movups (&QWP(0,$out),$inout0);
1035 &xorps ($inout2,$inout5);
1036 &movups (&QWP(0x10,$out),$inout1);
1037 &movups (&QWP(0x20,$out),$inout2);
1038 &jmp (&label("ctr32_ret"));
1040 &set_label("ctr32_four",16);
1041 &call ("_aesni_encrypt4");
1042 &movups ($inout4,&QWP(0,$inp));
1043 &movups ($inout5,&QWP(0x10,$inp));
1044 &movups ($rndkey1,&QWP(0x20,$inp));
1045 &xorps ($inout0,$inout4);
1046 &movups ($rndkey0,&QWP(0x30,$inp));
1047 &xorps ($inout1,$inout5);
1048 &movups (&QWP(0,$out),$inout0);
1049 &xorps ($inout2,$rndkey1);
1050 &movups (&QWP(0x10,$out),$inout1);
1051 &xorps ($inout3,$rndkey0);
1052 &movups (&QWP(0x20,$out),$inout2);
1053 &movups (&QWP(0x30,$out),$inout3);
1055 &set_label("ctr32_ret");
1056 &mov ("esp",&DWP(80,"esp"));
1057 &function_end("aesni_ctr32_encrypt_blocks");
1059 ######################################################################
1060 # void aesni_xts_[en|de]crypt(const char *inp,char *out,size_t len,
1061 # const AES_KEY *key1, const AES_KEY *key2,
1062 # const unsigned char iv[16]);
1064 { my ($tweak,$twtmp,$twres,$twmask)=($rndkey1,$rndkey0,$inout0,$inout1);
# Generator for aesni_xts_encrypt.
#
# Argument slots as read below: wparam(0)=inp, wparam(1)=out, wparam(2)=len,
# wparam(3)=key1, wparam(4)=key2, wparam(5)=iv (the clear-text tweak).
#
# Flow: encrypt iv under key2 to obtain the first tweak; process six blocks
# per iteration in xts_enc_loop6; handle one to five remaining whole blocks
# in the xts_enc_one/two/three/four paths and the five-block fall-through;
# when len%16 is non-zero, finish with the byte-swapping loop at
# xts_enc_steal followed by one more single-block encryption.
#
# Scratch frame (offsets from the aligned esp):
#   16*0 .. 16*5   per-block tweaks
#   16*6           constant dwords {0x87,0,1,0} used when doubling the tweak
#   16*7+0         len (overwritten with len%16 before stealing)
#   16*7+4         caller's esp
#
# $tweak/$twtmp/$twres/$twmask are aliases for $rndkey1/$rndkey0/$inout0/
# $inout1 (declared just before this function), so they are clobbered by
# anything that uses those registers.
#
# Tweak-doubling idiom, repeated throughout: $twtmp holds, per dword,
# all-ones where the matching dword of $tweak has its top bit set (pcmpgtd
# of zero against $tweak). pshufd with 0x13 moves dword 3 of that into
# dword 0 and dword 1 into dword 2; pand with the constant leaves 0x87 in
# dword 0 and 1 in dword 2. paddq shifts each 64-bit half left by one and
# the final pxor re-inserts the bit carried from the low half into the high
# half and folds the bit shifted out of the top back in as 0x87.
1066 &function_begin("aesni_xts_encrypt");
# Step 1: turn the caller's iv into the first tweak by encrypting it with key2.
1067 &mov ($key,&wparam(4)); # key2
1068 &mov ($inp,&wparam(5)); # clear-text tweak
1070 &mov ($rounds,&DWP(240,$key)); # key2->rounds
1071 &movups ($inout0,&QWP(0,$inp));
1072 if ($inline)
1073 { &aesni_inline_generate1("enc"); }
1074 else
1075 { &call ("_aesni_encrypt1"); }
# Step 2: load the data arguments; from here on $key refers to key1.
1077 &mov ($inp,&wparam(0));
1078 &mov ($out,&wparam(1));
1079 &mov ($len,&wparam(2));
1080 &mov ($key,&wparam(3)); # key1
# Reserve the scratch frame; $key_ carries the caller's esp until it is
# stored at 16*7+4 below.
1082 &mov ($key_,"esp");
1083 &sub ("esp",16*7+8);
1084 &mov ($rounds,&DWP(240,$key)); # key1->rounds
1085 &and ("esp",-16); # align stack
1087 &mov (&DWP(16*6+0,"esp"),0x87); # compose the magic constant
1088 &mov (&DWP(16*6+4,"esp"),0);
1089 &mov (&DWP(16*6+8,"esp"),1);
1090 &mov (&DWP(16*6+12,"esp"),0);
1091 &mov (&DWP(16*7+0,"esp"),$len); # save original $len
1092 &mov (&DWP(16*7+4,"esp"),$key_); # save original %esp
# The encrypted iv (still in $inout0) becomes the running tweak.
1094 &movdqa ($tweak,$inout0);
1095 &pxor ($twtmp,$twtmp);
1096 &movdqa ($twmask,&QWP(6*16,"esp")); # 0x0...010...87
1097 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
# Round len down to whole blocks and see whether at least six are available.
1099 &and ($len,-16);
1100 &mov ($key_,$key); # backup $key
1101 &mov ($rounds_,$rounds); # backup $rounds
1102 &sub ($len,16*6);
1103 &jc (&label("xts_enc_short"));
# Six-block path setup: $rounds_ = 16 - 16*rounds, and $key is re-pointed
# using base $key, index $rounds (already shifted left by 4) and
# displacement 32.
# NOTE(review): presumably this is the form _aesni_encrypt6_enter expects;
# that routine is defined outside this view -- confirm.
1105 &shl ($rounds,4);
1106 &mov ($rounds_,16);
1107 &sub ($rounds_,$rounds);
1108 &lea ($key,&DWP(32,$key,$rounds));
1109 &jmp (&label("xts_enc_loop6"));
1111 &set_label("xts_enc_loop6",16);
# Generator loop: emits four copies of "store the current tweak at
# 16*$i(esp), then double it".
# NOTE(review): the loop's closing brace is not visible in this listing
# (the embedded numbering skips a line after the pxor) -- confirm.
1112 for ($i=0;$i<4;$i++) {
1113 &pshufd ($twres,$twtmp,0x13);
1114 &pxor ($twtmp,$twtmp);
1115 &movdqa (&QWP(16*$i,"esp"),$tweak);
1116 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1117 &pand ($twres,$twmask); # isolate carry and residue
1118 &pcmpgtd ($twtmp,$tweak); # broadcast upper bits
1119 &pxor ($tweak,$twres);
# Fifth tweak goes to the slot at 16*$i and $i is post-incremented, so the
# "save last tweak" store further down lands one slot higher (16*4 and 16*5
# assuming the generator loop ended with $i==4). The sixth tweak is built
# in $inout5.
1121 &pshufd ($inout5,$twtmp,0x13);
1122 &movdqa (&QWP(16*$i++,"esp"),$tweak);
1123 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1124 &$movekey ($rndkey0,&QWP(0,$key_));
1125 &pand ($inout5,$twmask); # isolate carry and residue
1126 &movups ($inout0,&QWP(0,$inp)); # load input
1127 &pxor ($inout5,$tweak);
1129 # inline _aesni_encrypt6 prologue and flip xor with tweak and key[0]
1130 &mov ($rounds,$rounds_); # restore $rounds
1131 &movdqu ($inout1,&QWP(16*1,$inp));
1132 &xorps ($inout0,$rndkey0); # input^=rndkey[0]
1133 &movdqu ($inout2,&QWP(16*2,$inp));
1134 &pxor ($inout1,$rndkey0);
1135 &movdqu ($inout3,&QWP(16*3,$inp));
1136 &pxor ($inout2,$rndkey0);
1137 &movdqu ($inout4,&QWP(16*4,$inp));
1138 &pxor ($inout3,$rndkey0);
# Sixth input block is parked in $rndkey1 (alias $tweak) because $inout5
# still holds the sixth tweak; they are combined after the tweak is saved.
1139 &movdqu ($rndkey1,&QWP(16*5,$inp));
1140 &pxor ($inout4,$rndkey0);
1141 &lea ($inp,&DWP(16*6,$inp));
1142 &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak
1143 &movdqa (&QWP(16*$i,"esp"),$inout5); # save last tweak
1144 &pxor ($inout5,$rndkey1);
# First round (key at offset 16) is issued here, interleaved with the
# remaining tweak xors, before entering the shared six-lane routine.
1146 &$movekey ($rndkey1,&QWP(16,$key_));
1147 &pxor ($inout1,&QWP(16*1,"esp"));
1148 &pxor ($inout2,&QWP(16*2,"esp"));
1149 &aesenc ($inout0,$rndkey1);
1150 &pxor ($inout3,&QWP(16*3,"esp"));
1151 &pxor ($inout4,&QWP(16*4,"esp"));
1152 &aesenc ($inout1,$rndkey1);
1153 &pxor ($inout5,$rndkey0);
1154 &$movekey ($rndkey0,&QWP(32,$key_));
1155 &aesenc ($inout2,$rndkey1);
1156 &aesenc ($inout3,$rndkey1);
1157 &aesenc ($inout4,$rndkey1);
1158 &aesenc ($inout5,$rndkey1);
1159 &call (&label("_aesni_encrypt6_enter"));
# Post-process: xor each result with its saved tweak, write it out, and at
# the same time start doubling the last tweak for the next iteration.
1161 &movdqa ($tweak,&QWP(16*5,"esp")); # last tweak
1162 &pxor ($twtmp,$twtmp);
1163 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
1164 &pcmpgtd ($twtmp,$tweak); # broadcast upper bits
1165 &xorps ($inout1,&QWP(16*1,"esp"));
1166 &movups (&QWP(16*0,$out),$inout0); # write output
1167 &xorps ($inout2,&QWP(16*2,"esp"));
1168 &movups (&QWP(16*1,$out),$inout1);
1169 &xorps ($inout3,&QWP(16*3,"esp"));
1170 &movups (&QWP(16*2,$out),$inout2);
1171 &xorps ($inout4,&QWP(16*4,"esp"));
1172 &movups (&QWP(16*3,$out),$inout3);
1173 &xorps ($inout5,$tweak);
1174 &movups (&QWP(16*4,$out),$inout4);
1175 &pshufd ($twres,$twtmp,0x13);
1176 &movups (&QWP(16*5,$out),$inout5);
1177 &lea ($out,&DWP(16*6,$out));
# $twmask shares a register with $inout1, which was just used as a data
# lane, so the constant is reloaded from the frame.
1178 &movdqa ($twmask,&QWP(16*6,"esp")); # 0x0...010...87
1180 &pxor ($twtmp,$twtmp);
1181 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1182 &pand ($twres,$twmask); # isolate carry and residue
1183 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1184 &pxor ($tweak,$twres);
1186 &sub ($len,16*6);
1187 &jnc (&label("xts_enc_loop6"));
# Leaving the six-block loop: undo the $key/$rounds adjustments made for it.
1189 &mov ($rounds,&DWP(240,$key_)); # restore $rounds
1190 &mov ($key,$key_); # restore $key
1191 &mov ($rounds_,$rounds);
# Fewer than six whole blocks remain. Adding 16*6 back gives the remaining
# byte count (a multiple of 16); zero means only the optional partial block
# is left.
1193 &set_label("xts_enc_short");
1194 &add ($len,16*6);
1195 &jz (&label("xts_enc_done6x"));
# Dispatch on the number of remaining blocks while generating one tweak per
# block into $inout3, $inout4, $inout5 and $tweak. Each cmp is consumed by
# two branches (jb, then je); the SSE instructions in between do not
# modify the flags.
1197 &movdqa ($inout3,$tweak); # put aside previous tweak
1198 &cmp ($len,0x20);
1199 &jb (&label("xts_enc_one"));
1201 &pshufd ($twres,$twtmp,0x13);
1202 &pxor ($twtmp,$twtmp);
1203 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1204 &pand ($twres,$twmask); # isolate carry and residue
1205 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1206 &pxor ($tweak,$twres);
1207 &je (&label("xts_enc_two"));
1209 &pshufd ($twres,$twtmp,0x13);
1210 &pxor ($twtmp,$twtmp);
1211 &movdqa ($inout4,$tweak); # put aside previous tweak
1212 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1213 &pand ($twres,$twmask); # isolate carry and residue
1214 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1215 &pxor ($tweak,$twres);
1216 &cmp ($len,0x40);
1217 &jb (&label("xts_enc_three"));
1219 &pshufd ($twres,$twtmp,0x13);
1220 &pxor ($twtmp,$twtmp);
1221 &movdqa ($inout5,$tweak); # put aside previous tweak
1222 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1223 &pand ($twres,$twmask); # isolate carry and residue
1224 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1225 &pxor ($tweak,$twres);
# Four or five blocks: the first two tweaks are spilled to the frame
# because $inout3/$inout4 are about to be needed as data lanes.
1226 &movdqa (&QWP(16*0,"esp"),$inout3);
1227 &movdqa (&QWP(16*1,"esp"),$inout4);
1228 &je (&label("xts_enc_four"));
# Five blocks: spill tweaks three and four as well and derive the fifth in
# $inout5.
1230 &movdqa (&QWP(16*2,"esp"),$inout5);
1231 &pshufd ($inout5,$twtmp,0x13);
1232 &movdqa (&QWP(16*3,"esp"),$tweak);
1233 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1234 &pand ($inout5,$twmask); # isolate carry and residue
1235 &pxor ($inout5,$tweak);
1237 &movdqu ($inout0,&QWP(16*0,$inp)); # load input
1238 &movdqu ($inout1,&QWP(16*1,$inp));
1239 &movdqu ($inout2,&QWP(16*2,$inp));
1240 &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak
1241 &movdqu ($inout3,&QWP(16*3,$inp));
1242 &pxor ($inout1,&QWP(16*1,"esp"));
1243 &movdqu ($inout4,&QWP(16*4,$inp));
1244 &pxor ($inout2,&QWP(16*2,"esp"));
1245 &lea ($inp,&DWP(16*5,$inp));
1246 &pxor ($inout3,&QWP(16*3,"esp"));
1247 &movdqa (&QWP(16*4,"esp"),$inout5); # save last tweak
1248 &pxor ($inout4,$inout5);
# The six-lane routine is reused for five blocks; $inout5 still contains
# the fifth tweak and its result is never stored.
1250 &call ("_aesni_encrypt6");
1252 &movaps ($tweak,&QWP(16*4,"esp")); # last tweak
1253 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
1254 &xorps ($inout1,&QWP(16*1,"esp"));
1255 &xorps ($inout2,&QWP(16*2,"esp"));
1256 &movups (&QWP(16*0,$out),$inout0); # write output
1257 &xorps ($inout3,&QWP(16*3,"esp"));
1258 &movups (&QWP(16*1,$out),$inout1);
1259 &xorps ($inout4,$tweak);
1260 &movups (&QWP(16*2,$out),$inout2);
1261 &movups (&QWP(16*3,$out),$inout3);
1262 &movups (&QWP(16*4,$out),$inout4);
1263 &lea ($out,&DWP(16*5,$out));
1264 &jmp (&label("xts_enc_done"));
# One block: tweak is in $inout3.
1266 &set_label("xts_enc_one",16);
1267 &movups ($inout0,&QWP(16*0,$inp)); # load input
1268 &lea ($inp,&DWP(16*1,$inp));
1269 &xorps ($inout0,$inout3); # input^=tweak
1270 if ($inline)
1271 { &aesni_inline_generate1("enc"); }
1272 else
1273 { &call ("_aesni_encrypt1"); }
1274 &xorps ($inout0,$inout3); # output^=tweak
1275 &movups (&QWP(16*0,$out),$inout0); # write output
1276 &lea ($out,&DWP(16*1,$out));
# Every short path leaves the last tweak it used in $tweak for xts_enc_done.
1278 &movdqa ($tweak,$inout3); # last tweak
1279 &jmp (&label("xts_enc_done"));
# Two blocks: tweaks in $inout3 and $tweak (copied to $inout4).
1281 &set_label("xts_enc_two",16);
1282 &movaps ($inout4,$tweak); # put aside last tweak
1284 &movups ($inout0,&QWP(16*0,$inp)); # load input
1285 &movups ($inout1,&QWP(16*1,$inp));
1286 &lea ($inp,&DWP(16*2,$inp));
1287 &xorps ($inout0,$inout3); # input^=tweak
1288 &xorps ($inout1,$inout4);
1290 &call ("_aesni_encrypt2");
1292 &xorps ($inout0,$inout3); # output^=tweak
1293 &xorps ($inout1,$inout4);
1294 &movups (&QWP(16*0,$out),$inout0); # write output
1295 &movups (&QWP(16*1,$out),$inout1);
1296 &lea ($out,&DWP(16*2,$out));
1298 &movdqa ($tweak,$inout4); # last tweak
1299 &jmp (&label("xts_enc_done"));
# Three blocks: tweaks in $inout3, $inout4 and $tweak (copied to $inout5).
1301 &set_label("xts_enc_three",16);
1302 &movaps ($inout5,$tweak); # put aside last tweak
1303 &movups ($inout0,&QWP(16*0,$inp)); # load input
1304 &movups ($inout1,&QWP(16*1,$inp));
1305 &movups ($inout2,&QWP(16*2,$inp));
1306 &lea ($inp,&DWP(16*3,$inp));
1307 &xorps ($inout0,$inout3); # input^=tweak
1308 &xorps ($inout1,$inout4);
1309 &xorps ($inout2,$inout5);
1311 &call ("_aesni_encrypt3");
1313 &xorps ($inout0,$inout3); # output^=tweak
1314 &xorps ($inout1,$inout4);
1315 &xorps ($inout2,$inout5);
1316 &movups (&QWP(16*0,$out),$inout0); # write output
1317 &movups (&QWP(16*1,$out),$inout1);
1318 &movups (&QWP(16*2,$out),$inout2);
1319 &lea ($out,&DWP(16*3,$out));
1321 &movdqa ($tweak,$inout5); # last tweak
1322 &jmp (&label("xts_enc_done"));
# Four blocks: tweaks one and two were spilled to 16*0/16*1(esp) before the
# branch here, the third is in $inout5 and the fourth in $tweak (copied to
# $inout4).
1324 &set_label("xts_enc_four",16);
1325 &movaps ($inout4,$tweak); # put aside last tweak
1327 &movups ($inout0,&QWP(16*0,$inp)); # load input
1328 &movups ($inout1,&QWP(16*1,$inp));
1329 &movups ($inout2,&QWP(16*2,$inp));
1330 &xorps ($inout0,&QWP(16*0,"esp")); # input^=tweak
1331 &movups ($inout3,&QWP(16*3,$inp));
1332 &lea ($inp,&DWP(16*4,$inp));
1333 &xorps ($inout1,&QWP(16*1,"esp"));
1334 &xorps ($inout2,$inout5);
1335 &xorps ($inout3,$inout4);
1337 &call ("_aesni_encrypt4");
1339 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
1340 &xorps ($inout1,&QWP(16*1,"esp"));
1341 &xorps ($inout2,$inout5);
1342 &movups (&QWP(16*0,$out),$inout0); # write output
1343 &xorps ($inout3,$inout4);
1344 &movups (&QWP(16*1,$out),$inout1);
1345 &movups (&QWP(16*2,$out),$inout2);
1346 &movups (&QWP(16*3,$out),$inout3);
1347 &lea ($out,&DWP(16*4,$out));
1349 &movdqa ($tweak,$inout4); # last tweak
1350 &jmp (&label("xts_enc_done"));
# Reached when the whole-block count was a multiple of six (or zero): the
# tweak for the next block is already in $tweak, so it is used as-is for
# the stolen block.
1352 &set_label("xts_enc_done6x",16); # $tweak is pre-calculated
1353 &mov ($len,&DWP(16*7+0,"esp")); # restore original $len
1354 &and ($len,15);
1355 &jz (&label("xts_enc_ret"));
1356 &movdqa ($inout3,$tweak);
1357 &mov (&DWP(16*7+0,"esp"),$len); # save $len%16
1358 &jmp (&label("xts_enc_steal"));
# Reached from the short paths: $tweak is the last tweak used, so it is
# doubled once more (into $inout3) if a partial block follows.
1360 &set_label("xts_enc_done",16);
1361 &mov ($len,&DWP(16*7+0,"esp")); # restore original $len
1362 &pxor ($twtmp,$twtmp);
1363 &and ($len,15);
1364 &jz (&label("xts_enc_ret"));
1366 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1367 &mov (&DWP(16*7+0,"esp"),$len); # save $len%16
1368 &pshufd ($inout3,$twtmp,0x13);
1369 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1370 &pand ($inout3,&QWP(16*6,"esp")); # isolate carry and residue
1371 &pxor ($inout3,$tweak);
# Byte loop, len%16 iterations: each pass reads one trailing input byte and
# one byte of the last block written (at $out-16), stores the input byte
# over the latter and moves the displaced byte to the current $out
# position. $rounds and $key serve as byte scratch here and are restored
# afterwards.
1373 &set_label("xts_enc_steal");
1374 &movz ($rounds,&BP(0,$inp));
1375 &movz ($key,&BP(-16,$out));
1376 &lea ($inp,&DWP(1,$inp));
1377 &mov (&BP(-16,$out),&LB($rounds));
1378 &mov (&BP(0,$out),&LB($key));
1379 &lea ($out,&DWP(1,$out));
1380 &sub ($len,1);
1381 &jnz (&label("xts_enc_steal"));
1383 &sub ($out,&DWP(16*7+0,"esp")); # rewind $out
1384 &mov ($key,$key_); # restore $key
1385 &mov ($rounds,$rounds_); # restore $rounds
# Encrypt the patched block at $out-16 in place with the tweak in $inout3.
1387 &movups ($inout0,&QWP(-16,$out)); # load input
1388 &xorps ($inout0,$inout3); # input^=tweak
1389 if ($inline)
1390 { &aesni_inline_generate1("enc"); }
1391 else
1392 { &call ("_aesni_encrypt1"); }
1393 &xorps ($inout0,$inout3); # output^=tweak
1394 &movups (&QWP(-16,$out),$inout0); # write output
1396 &set_label("xts_enc_ret");
1397 &mov ("esp",&DWP(16*7+4,"esp")); # restore %esp
1398 &function_end("aesni_xts_encrypt");
# Generator for aesni_xts_decrypt.
#
# Argument slots as read below: wparam(0)=inp, wparam(1)=out, wparam(2)=len,
# wparam(3)=key1, wparam(4)=key2, wparam(5)=iv (the clear-text tweak).
#
# Mirrors the encrypt routine with aesdec/_aesni_decrypt* in the data path.
# Differences visible below:
#   - the iv is still *encrypted* (with key2) to form the first tweak;
#   - when len%16 is non-zero, 16 is subtracted from len up front so that
#     one whole block is held back for xts_dec_only_one_more;
#   - in the partial-block tail the held-back block is decrypted with the
#     later of two consecutive tweaks ($inout3) and the block rebuilt by
#     xts_dec_steal is decrypted with the earlier one ($inout4).
#
# Scratch frame (offsets from the aligned esp):
#   16*0 .. 16*5   per-block tweaks
#   16*6           constant dwords {0x87,0,1,0} used when doubling the tweak
#   16*7+0         len (overwritten with len%16 before stealing)
#   16*7+4         caller's esp
#
# $tweak/$twtmp/$twres/$twmask are aliases for $rndkey1/$rndkey0/$inout0/
# $inout1 (declared before the XTS routines).
#
# Tweak-doubling idiom, repeated throughout: $twtmp holds, per dword,
# all-ones where the matching dword of $tweak has its top bit set; pshufd
# with 0x13 moves dword 3 of that into dword 0 and dword 1 into dword 2;
# pand with the constant leaves 0x87 in dword 0 and 1 in dword 2; paddq
# shifts each 64-bit half left by one; pxor merges the two carries back in.
1400 &function_begin("aesni_xts_decrypt");
# Step 1: first tweak = iv encrypted under key2 (note "enc", not "dec").
1401 &mov ($key,&wparam(4)); # key2
1402 &mov ($inp,&wparam(5)); # clear-text tweak
1404 &mov ($rounds,&DWP(240,$key)); # key2->rounds
1405 &movups ($inout0,&QWP(0,$inp));
1406 if ($inline)
1407 { &aesni_inline_generate1("enc"); }
1408 else
1409 { &call ("_aesni_encrypt1"); }
# Step 2: load the data arguments; from here on $key refers to key1.
1411 &mov ($inp,&wparam(0));
1412 &mov ($out,&wparam(1));
1413 &mov ($len,&wparam(2));
1414 &mov ($key,&wparam(3)); # key1
# Reserve the scratch frame; $key_ carries the caller's esp until it is
# stored at 16*7+4 below.
1416 &mov ($key_,"esp");
1417 &sub ("esp",16*7+8);
1418 &and ("esp",-16); # align stack
# $rounds_ becomes 16 when len has a partial trailing block and 0 otherwise.
1420 &xor ($rounds_,$rounds_); # if(len%16) len-=16;
1421 &test ($len,15);
1422 &setnz (&LB($rounds_));
1423 &shl ($rounds_,4);
1424 &sub ($len,$rounds_);
1426 &mov (&DWP(16*6+0,"esp"),0x87); # compose the magic constant
1427 &mov (&DWP(16*6+4,"esp"),0);
1428 &mov (&DWP(16*6+8,"esp"),1);
1429 &mov (&DWP(16*6+12,"esp"),0);
1430 &mov (&DWP(16*7+0,"esp"),$len); # save $len (already reduced by 16 if it had a partial block)
1431 &mov (&DWP(16*7+4,"esp"),$key_); # save original %esp
1433 &mov ($rounds,&DWP(240,$key)); # key1->rounds
1434 &mov ($key_,$key); # backup $key
1435 &mov ($rounds_,$rounds); # backup $rounds
# The encrypted iv (still in $inout0) becomes the running tweak.
1437 &movdqa ($tweak,$inout0);
1438 &pxor ($twtmp,$twtmp);
1439 &movdqa ($twmask,&QWP(6*16,"esp")); # 0x0...010...87
1440 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
# Round len down to whole blocks and see whether at least six are available.
1442 &and ($len,-16);
1443 &sub ($len,16*6);
1444 &jc (&label("xts_dec_short"));
# Six-block path setup: $rounds_ = 16 - 16*rounds, and $key is re-pointed
# using base $key, index $rounds (already shifted left by 4) and
# displacement 32.
# NOTE(review): presumably this is the form _aesni_decrypt6_enter expects;
# that routine is defined outside this view -- confirm.
1446 &shl ($rounds,4);
1447 &mov ($rounds_,16);
1448 &sub ($rounds_,$rounds);
1449 &lea ($key,&DWP(32,$key,$rounds));
1450 &jmp (&label("xts_dec_loop6"));
1452 &set_label("xts_dec_loop6",16);
# Generator loop: emits four copies of "store the current tweak at
# 16*$i(esp), then double it".
# NOTE(review): the loop's closing brace is not visible in this listing
# (the embedded numbering skips a line after the pxor) -- confirm.
1453 for ($i=0;$i<4;$i++) {
1454 &pshufd ($twres,$twtmp,0x13);
1455 &pxor ($twtmp,$twtmp);
1456 &movdqa (&QWP(16*$i,"esp"),$tweak);
1457 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1458 &pand ($twres,$twmask); # isolate carry and residue
1459 &pcmpgtd ($twtmp,$tweak); # broadcast upper bits
1460 &pxor ($tweak,$twres);
# Fifth tweak goes to the slot at 16*$i and $i is post-incremented, so the
# "save last tweak" store further down lands one slot higher (16*4 and 16*5
# assuming the generator loop ended with $i==4). The sixth tweak is built
# in $inout5.
1462 &pshufd ($inout5,$twtmp,0x13);
1463 &movdqa (&QWP(16*$i++,"esp"),$tweak);
1464 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1465 &$movekey ($rndkey0,&QWP(0,$key_));
1466 &pand ($inout5,$twmask); # isolate carry and residue
1467 &movups ($inout0,&QWP(0,$inp)); # load input
1468 &pxor ($inout5,$tweak);
1470 # inline _aesni_decrypt6 prologue and flip xor with tweak and key[0]
1471 &mov ($rounds,$rounds_); # restore $rounds
1472 &movdqu ($inout1,&QWP(16*1,$inp));
1473 &xorps ($inout0,$rndkey0); # input^=rndkey[0]
1474 &movdqu ($inout2,&QWP(16*2,$inp));
1475 &pxor ($inout1,$rndkey0);
1476 &movdqu ($inout3,&QWP(16*3,$inp));
1477 &pxor ($inout2,$rndkey0);
1478 &movdqu ($inout4,&QWP(16*4,$inp));
1479 &pxor ($inout3,$rndkey0);
# Sixth input block is parked in $rndkey1 (alias $tweak) because $inout5
# still holds the sixth tweak; they are combined after the tweak is saved.
1480 &movdqu ($rndkey1,&QWP(16*5,$inp));
1481 &pxor ($inout4,$rndkey0);
1482 &lea ($inp,&DWP(16*6,$inp));
1483 &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak
1484 &movdqa (&QWP(16*$i,"esp"),$inout5); # save last tweak
1485 &pxor ($inout5,$rndkey1);
# First round (key at offset 16) is issued here, interleaved with the
# remaining tweak xors, before entering the shared six-lane routine.
1487 &$movekey ($rndkey1,&QWP(16,$key_));
1488 &pxor ($inout1,&QWP(16*1,"esp"));
1489 &pxor ($inout2,&QWP(16*2,"esp"));
1490 &aesdec ($inout0,$rndkey1);
1491 &pxor ($inout3,&QWP(16*3,"esp"));
1492 &pxor ($inout4,&QWP(16*4,"esp"));
1493 &aesdec ($inout1,$rndkey1);
1494 &pxor ($inout5,$rndkey0);
1495 &$movekey ($rndkey0,&QWP(32,$key_));
1496 &aesdec ($inout2,$rndkey1);
1497 &aesdec ($inout3,$rndkey1);
1498 &aesdec ($inout4,$rndkey1);
1499 &aesdec ($inout5,$rndkey1);
1500 &call (&label("_aesni_decrypt6_enter"));
# Post-process: xor each result with its saved tweak, write it out, and at
# the same time start doubling the last tweak for the next iteration.
1502 &movdqa ($tweak,&QWP(16*5,"esp")); # last tweak
1503 &pxor ($twtmp,$twtmp);
1504 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
1505 &pcmpgtd ($twtmp,$tweak); # broadcast upper bits
1506 &xorps ($inout1,&QWP(16*1,"esp"));
1507 &movups (&QWP(16*0,$out),$inout0); # write output
1508 &xorps ($inout2,&QWP(16*2,"esp"));
1509 &movups (&QWP(16*1,$out),$inout1);
1510 &xorps ($inout3,&QWP(16*3,"esp"));
1511 &movups (&QWP(16*2,$out),$inout2);
1512 &xorps ($inout4,&QWP(16*4,"esp"));
1513 &movups (&QWP(16*3,$out),$inout3);
1514 &xorps ($inout5,$tweak);
1515 &movups (&QWP(16*4,$out),$inout4);
1516 &pshufd ($twres,$twtmp,0x13);
1517 &movups (&QWP(16*5,$out),$inout5);
1518 &lea ($out,&DWP(16*6,$out));
# $twmask shares a register with $inout1, which was just used as a data
# lane, so the constant is reloaded from the frame.
1519 &movdqa ($twmask,&QWP(16*6,"esp")); # 0x0...010...87
1521 &pxor ($twtmp,$twtmp);
1522 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1523 &pand ($twres,$twmask); # isolate carry and residue
1524 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1525 &pxor ($tweak,$twres);
1527 &sub ($len,16*6);
1528 &jnc (&label("xts_dec_loop6"));
# Leaving the six-block loop: undo the $key/$rounds adjustments made for it.
1530 &mov ($rounds,&DWP(240,$key_)); # restore $rounds
1531 &mov ($key,$key_); # restore $key
1532 &mov ($rounds_,$rounds);
# Fewer than six whole blocks remain. Adding 16*6 back gives the remaining
# byte count (a multiple of 16); zero means only the optional tail is left.
1534 &set_label("xts_dec_short");
1535 &add ($len,16*6);
1536 &jz (&label("xts_dec_done6x"));
# Dispatch on the number of remaining blocks while generating one tweak per
# block into $inout3, $inout4, $inout5 and $tweak. Each cmp is consumed by
# two branches (jb, then je); the SSE instructions in between do not
# modify the flags.
1538 &movdqa ($inout3,$tweak); # put aside previous tweak
1539 &cmp ($len,0x20);
1540 &jb (&label("xts_dec_one"));
1542 &pshufd ($twres,$twtmp,0x13);
1543 &pxor ($twtmp,$twtmp);
1544 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1545 &pand ($twres,$twmask); # isolate carry and residue
1546 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1547 &pxor ($tweak,$twres);
1548 &je (&label("xts_dec_two"));
1550 &pshufd ($twres,$twtmp,0x13);
1551 &pxor ($twtmp,$twtmp);
1552 &movdqa ($inout4,$tweak); # put aside previous tweak
1553 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1554 &pand ($twres,$twmask); # isolate carry and residue
1555 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1556 &pxor ($tweak,$twres);
1557 &cmp ($len,0x40);
1558 &jb (&label("xts_dec_three"));
1560 &pshufd ($twres,$twtmp,0x13);
1561 &pxor ($twtmp,$twtmp);
1562 &movdqa ($inout5,$tweak); # put aside previous tweak
1563 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1564 &pand ($twres,$twmask); # isolate carry and residue
1565 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1566 &pxor ($tweak,$twres);
# Four or five blocks: the first two tweaks are spilled to the frame
# because $inout3/$inout4 are about to be needed as data lanes.
1567 &movdqa (&QWP(16*0,"esp"),$inout3);
1568 &movdqa (&QWP(16*1,"esp"),$inout4);
1569 &je (&label("xts_dec_four"));
# Five blocks: spill tweaks three and four as well and derive the fifth in
# $inout5.
1571 &movdqa (&QWP(16*2,"esp"),$inout5);
1572 &pshufd ($inout5,$twtmp,0x13);
1573 &movdqa (&QWP(16*3,"esp"),$tweak);
1574 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1575 &pand ($inout5,$twmask); # isolate carry and residue
1576 &pxor ($inout5,$tweak);
1578 &movdqu ($inout0,&QWP(16*0,$inp)); # load input
1579 &movdqu ($inout1,&QWP(16*1,$inp));
1580 &movdqu ($inout2,&QWP(16*2,$inp));
1581 &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak
1582 &movdqu ($inout3,&QWP(16*3,$inp));
1583 &pxor ($inout1,&QWP(16*1,"esp"));
1584 &movdqu ($inout4,&QWP(16*4,$inp));
1585 &pxor ($inout2,&QWP(16*2,"esp"));
1586 &lea ($inp,&DWP(16*5,$inp));
1587 &pxor ($inout3,&QWP(16*3,"esp"));
1588 &movdqa (&QWP(16*4,"esp"),$inout5); # save last tweak
1589 &pxor ($inout4,$inout5);
# The six-lane routine is reused for five blocks; $inout5 still contains
# the fifth tweak and its result is never stored.
1591 &call ("_aesni_decrypt6");
1593 &movaps ($tweak,&QWP(16*4,"esp")); # last tweak
1594 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
1595 &xorps ($inout1,&QWP(16*1,"esp"));
1596 &xorps ($inout2,&QWP(16*2,"esp"));
1597 &movups (&QWP(16*0,$out),$inout0); # write output
1598 &xorps ($inout3,&QWP(16*3,"esp"));
1599 &movups (&QWP(16*1,$out),$inout1);
1600 &xorps ($inout4,$tweak);
1601 &movups (&QWP(16*2,$out),$inout2);
1602 &movups (&QWP(16*3,$out),$inout3);
1603 &movups (&QWP(16*4,$out),$inout4);
1604 &lea ($out,&DWP(16*5,$out));
1605 &jmp (&label("xts_dec_done"));
# One block: tweak is in $inout3.
1607 &set_label("xts_dec_one",16);
1608 &movups ($inout0,&QWP(16*0,$inp)); # load input
1609 &lea ($inp,&DWP(16*1,$inp));
1610 &xorps ($inout0,$inout3); # input^=tweak
1611 if ($inline)
1612 { &aesni_inline_generate1("dec"); }
1613 else
1614 { &call ("_aesni_decrypt1"); }
1615 &xorps ($inout0,$inout3); # output^=tweak
1616 &movups (&QWP(16*0,$out),$inout0); # write output
1617 &lea ($out,&DWP(16*1,$out));
# Every short path leaves the last tweak it used in $tweak for xts_dec_done.
1619 &movdqa ($tweak,$inout3); # last tweak
1620 &jmp (&label("xts_dec_done"));
# Two blocks: tweaks in $inout3 and $tweak (copied to $inout4).
1622 &set_label("xts_dec_two",16);
1623 &movaps ($inout4,$tweak); # put aside last tweak
1625 &movups ($inout0,&QWP(16*0,$inp)); # load input
1626 &movups ($inout1,&QWP(16*1,$inp));
1627 &lea ($inp,&DWP(16*2,$inp));
1628 &xorps ($inout0,$inout3); # input^=tweak
1629 &xorps ($inout1,$inout4);
1631 &call ("_aesni_decrypt2");
1633 &xorps ($inout0,$inout3); # output^=tweak
1634 &xorps ($inout1,$inout4);
1635 &movups (&QWP(16*0,$out),$inout0); # write output
1636 &movups (&QWP(16*1,$out),$inout1);
1637 &lea ($out,&DWP(16*2,$out));
1639 &movdqa ($tweak,$inout4); # last tweak
1640 &jmp (&label("xts_dec_done"));
# Three blocks: tweaks in $inout3, $inout4 and $tweak (copied to $inout5).
1642 &set_label("xts_dec_three",16);
1643 &movaps ($inout5,$tweak); # put aside last tweak
1644 &movups ($inout0,&QWP(16*0,$inp)); # load input
1645 &movups ($inout1,&QWP(16*1,$inp));
1646 &movups ($inout2,&QWP(16*2,$inp));
1647 &lea ($inp,&DWP(16*3,$inp));
1648 &xorps ($inout0,$inout3); # input^=tweak
1649 &xorps ($inout1,$inout4);
1650 &xorps ($inout2,$inout5);
1652 &call ("_aesni_decrypt3");
1654 &xorps ($inout0,$inout3); # output^=tweak
1655 &xorps ($inout1,$inout4);
1656 &xorps ($inout2,$inout5);
1657 &movups (&QWP(16*0,$out),$inout0); # write output
1658 &movups (&QWP(16*1,$out),$inout1);
1659 &movups (&QWP(16*2,$out),$inout2);
1660 &lea ($out,&DWP(16*3,$out));
1662 &movdqa ($tweak,$inout5); # last tweak
1663 &jmp (&label("xts_dec_done"));
# Four blocks: tweaks one and two were spilled to 16*0/16*1(esp) before the
# branch here, the third is in $inout5 and the fourth in $tweak (copied to
# $inout4).
1665 &set_label("xts_dec_four",16);
1666 &movaps ($inout4,$tweak); # put aside last tweak
1668 &movups ($inout0,&QWP(16*0,$inp)); # load input
1669 &movups ($inout1,&QWP(16*1,$inp));
1670 &movups ($inout2,&QWP(16*2,$inp));
1671 &xorps ($inout0,&QWP(16*0,"esp")); # input^=tweak
1672 &movups ($inout3,&QWP(16*3,$inp));
1673 &lea ($inp,&DWP(16*4,$inp));
1674 &xorps ($inout1,&QWP(16*1,"esp"));
1675 &xorps ($inout2,$inout5);
1676 &xorps ($inout3,$inout4);
1678 &call ("_aesni_decrypt4");
1680 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
1681 &xorps ($inout1,&QWP(16*1,"esp"));
1682 &xorps ($inout2,$inout5);
1683 &movups (&QWP(16*0,$out),$inout0); # write output
1684 &xorps ($inout3,$inout4);
1685 &movups (&QWP(16*1,$out),$inout1);
1686 &movups (&QWP(16*2,$out),$inout2);
1687 &movups (&QWP(16*3,$out),$inout3);
1688 &lea ($out,&DWP(16*4,$out));
1690 &movdqa ($tweak,$inout4); # last tweak
1691 &jmp (&label("xts_dec_done"));
# Reached when the whole-block count was a multiple of six (or zero): the
# next tweak is already in $tweak (with $twtmp/$twmask set up for one more
# doubling), so control goes straight to the held-back block.
1693 &set_label("xts_dec_done6x",16); # $tweak is pre-calculated
1694 &mov ($len,&DWP(16*7+0,"esp")); # restore original $len
1695 &and ($len,15);
1696 &jz (&label("xts_dec_ret"));
1697 &mov (&DWP(16*7+0,"esp"),$len); # save $len%16
1698 &jmp (&label("xts_dec_only_one_more"));
# Reached from the short paths: $tweak is the last tweak used, so it is
# advanced once here (also refreshing $twtmp and $twmask) before falling
# into xts_dec_only_one_more.
1700 &set_label("xts_dec_done",16);
1701 &mov ($len,&DWP(16*7+0,"esp")); # restore original $len
1702 &pxor ($twtmp,$twtmp);
1703 &and ($len,15);
1704 &jz (&label("xts_dec_ret"));
1706 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1707 &mov (&DWP(16*7+0,"esp"),$len); # save $len%16
1708 &pshufd ($twres,$twtmp,0x13);
1709 &pxor ($twtmp,$twtmp);
1710 &movdqa ($twmask,&QWP(16*6,"esp"));
1711 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1712 &pand ($twres,$twmask); # isolate carry and residue
1713 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1714 &pxor ($tweak,$twres);
# Held-back whole block: $inout4 keeps the current tweak for the rebuilt
# block, while $inout3 receives the following tweak, which is the one
# applied to this block.
1716 &set_label("xts_dec_only_one_more");
1717 &pshufd ($inout3,$twtmp,0x13);
1718 &movdqa ($inout4,$tweak); # put aside previous tweak
1719 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1720 &pand ($inout3,$twmask); # isolate carry and residue
1721 &pxor ($inout3,$tweak);
1723 &mov ($key,$key_); # restore $key
1724 &mov ($rounds,$rounds_); # restore $rounds
# $inp and $out are not advanced past this block, which is why the steal
# loop below addresses the trailing bytes at offset 16.
1726 &movups ($inout0,&QWP(0,$inp)); # load input
1727 &xorps ($inout0,$inout3); # input^=tweak
1728 if ($inline)
1729 { &aesni_inline_generate1("dec"); }
1730 else
1731 { &call ("_aesni_decrypt1"); }
1732 &xorps ($inout0,$inout3); # output^=tweak
1733 &movups (&QWP(0,$out),$inout0); # write output
# Byte loop, len%16 iterations: each pass reads one trailing input byte (at
# $inp+16) and one byte of the block just written, stores the input byte
# into that block and moves the displaced byte to $out+16. $rounds and
# $key serve as byte scratch here and are restored afterwards.
1735 &set_label("xts_dec_steal");
1736 &movz ($rounds,&BP(16,$inp));
1737 &movz ($key,&BP(0,$out));
1738 &lea ($inp,&DWP(1,$inp));
1739 &mov (&BP(0,$out),&LB($rounds));
1740 &mov (&BP(16,$out),&LB($key));
1741 &lea ($out,&DWP(1,$out));
1742 &sub ($len,1);
1743 &jnz (&label("xts_dec_steal"));
1745 &sub ($out,&DWP(16*7+0,"esp")); # rewind $out
1746 &mov ($key,$key_); # restore $key
1747 &mov ($rounds,$rounds_); # restore $rounds
# Decrypt the rebuilt block in place with the earlier tweak ($inout4).
1749 &movups ($inout0,&QWP(0,$out)); # load input
1750 &xorps ($inout0,$inout4); # input^=tweak
1751 if ($inline)
1752 { &aesni_inline_generate1("dec"); }
1753 else
1754 { &call ("_aesni_decrypt1"); }
1755 &xorps ($inout0,$inout4); # output^=tweak
1756 &movups (&QWP(0,$out),$inout0); # write output
1758 &set_label("xts_dec_ret");
1759 &mov ("esp",&DWP(16*7+4,"esp")); # restore %esp
1760 &function_end("aesni_xts_decrypt");
1764 ######################################################################
1765 # void $PREFIX_cbc_encrypt (const void *inp, void *out,
1766 # size_t length, const AES_KEY *key,
1767 # unsigned char *ivp,const int enc);
1768 &function_begin("${PREFIX}_cbc_encrypt");
1769 &mov ($inp,&wparam(0));
1770 &mov ($rounds_,"esp");
1771 &mov ($out,&wparam(1));
1772 &sub ($rounds_,24);
1773 &mov ($len,&wparam(2));
1774 &and ($rounds_,-16);
1775 &mov ($key,&wparam(3));
1776 &mov ($key_,&wparam(4));
1777 &test ($len,$len);
1778 &jz (&label("cbc_abort"));
1780 &cmp (&wparam(5),0);
1781 &xchg ($rounds_,"esp"); # alloca
1782 &movups ($ivec,&QWP(0,$key_)); # load IV
1783 &mov ($rounds,&DWP(240,$key));
1784 &mov ($key_,$key); # backup $key
1785 &mov (&DWP(16,"esp"),$rounds_); # save original %esp
1786 &mov ($rounds_,$rounds); # backup $rounds
1787 &je (&label("cbc_decrypt"));
1789 &movaps ($inout0,$ivec);
1790 &cmp ($len,16);
1791 &jb (&label("cbc_enc_tail"));
1792 &sub ($len,16);
1793 &jmp (&label("cbc_enc_loop"));
1795 &set_label("cbc_enc_loop",16);
1796 &movups ($ivec,&QWP(0,$inp)); # input actually
1797 &lea ($inp,&DWP(16,$inp));
1798 if ($inline)
1799 { &aesni_inline_generate1("enc",$inout0,$ivec); }
1800 else
1801 { &xorps($inout0,$ivec); &call("_aesni_encrypt1"); }
1802 &mov ($rounds,$rounds_); # restore $rounds
1803 &mov ($key,$key_); # restore $key
1804 &movups (&QWP(0,$out),$inout0); # store output
1805 &lea ($out,&DWP(16,$out));
1806 &sub ($len,16);
1807 &jnc (&label("cbc_enc_loop"));
1808 &add ($len,16);
1809 &jnz (&label("cbc_enc_tail"));
1810 &movaps ($ivec,$inout0);
1811 &jmp (&label("cbc_ret"));
1813 &set_label("cbc_enc_tail");
1814 &mov ("ecx",$len); # zaps $rounds
1815 &data_word(0xA4F3F689); # rep movsb
1816 &mov ("ecx",16); # zero tail
1817 &sub ("ecx",$len);
1818 &xor ("eax","eax"); # zaps $len
1819 &data_word(0xAAF3F689); # rep stosb
1820 &lea ($out,&DWP(-16,$out)); # rewind $out by 1 block
1821 &mov ($rounds,$rounds_); # restore $rounds
1822 &mov ($inp,$out); # $inp and $out are the same
1823 &mov ($key,$key_); # restore $key
1824 &jmp (&label("cbc_enc_loop"));
1825 ######################################################################
1826 &set_label("cbc_decrypt",16);
1827 &cmp ($len,0x50);
1828 &jbe (&label("cbc_dec_tail"));
1829 &movaps (&QWP(0,"esp"),$ivec); # save IV
1830 &sub ($len,0x50);
1831 &jmp (&label("cbc_dec_loop6_enter"));
1833 &set_label("cbc_dec_loop6",16);
1834 &movaps (&QWP(0,"esp"),$rndkey0); # save IV
1835 &movups (&QWP(0,$out),$inout5);
1836 &lea ($out,&DWP(0x10,$out));
1837 &set_label("cbc_dec_loop6_enter");
1838 &movdqu ($inout0,&QWP(0,$inp));
1839 &movdqu ($inout1,&QWP(0x10,$inp));
1840 &movdqu ($inout2,&QWP(0x20,$inp));
1841 &movdqu ($inout3,&QWP(0x30,$inp));
1842 &movdqu ($inout4,&QWP(0x40,$inp));
1843 &movdqu ($inout5,&QWP(0x50,$inp));
1845 &call ("_aesni_decrypt6");
1847 &movups ($rndkey1,&QWP(0,$inp));
1848 &movups ($rndkey0,&QWP(0x10,$inp));
1849 &xorps ($inout0,&QWP(0,"esp")); # ^=IV
1850 &xorps ($inout1,$rndkey1);
1851 &movups ($rndkey1,&QWP(0x20,$inp));
1852 &xorps ($inout2,$rndkey0);
1853 &movups ($rndkey0,&QWP(0x30,$inp));
1854 &xorps ($inout3,$rndkey1);
1855 &movups ($rndkey1,&QWP(0x40,$inp));
1856 &xorps ($inout4,$rndkey0);
1857 &movups ($rndkey0,&QWP(0x50,$inp)); # IV
1858 &xorps ($inout5,$rndkey1);
1859 &movups (&QWP(0,$out),$inout0);
1860 &movups (&QWP(0x10,$out),$inout1);
1861 &lea ($inp,&DWP(0x60,$inp));
1862 &movups (&QWP(0x20,$out),$inout2);
1863 &mov ($rounds,$rounds_); # restore $rounds
1864 &movups (&QWP(0x30,$out),$inout3);
1865 &mov ($key,$key_); # restore $key
1866 &movups (&QWP(0x40,$out),$inout4);
1867 &lea ($out,&DWP(0x50,$out));
1868 &sub ($len,0x60);
1869 &ja (&label("cbc_dec_loop6"));
1871 &movaps ($inout0,$inout5);
1872 &movaps ($ivec,$rndkey0);
1873 &add ($len,0x50);
1874 &jle (&label("cbc_dec_tail_collected"));
1875 &movups (&QWP(0,$out),$inout0);
1876 &lea ($out,&DWP(0x10,$out));
1877 &set_label("cbc_dec_tail");
1878 &movups ($inout0,&QWP(0,$inp));
1879 &movaps ($in0,$inout0);
1880 &cmp ($len,0x10);
1881 &jbe (&label("cbc_dec_one"));
1883 &movups ($inout1,&QWP(0x10,$inp));
1884 &movaps ($in1,$inout1);
1885 &cmp ($len,0x20);
1886 &jbe (&label("cbc_dec_two"));
1888 &movups ($inout2,&QWP(0x20,$inp));
1889 &cmp ($len,0x30);
1890 &jbe (&label("cbc_dec_three"));
1892 &movups ($inout3,&QWP(0x30,$inp));
1893 &cmp ($len,0x40);
1894 &jbe (&label("cbc_dec_four"));
1896 &movups ($inout4,&QWP(0x40,$inp));
1897 &movaps (&QWP(0,"esp"),$ivec); # save IV
1898 &movups ($inout0,&QWP(0,$inp));
1899 &xorps ($inout5,$inout5);
1900 &call ("_aesni_decrypt6");
1901 &movups ($rndkey1,&QWP(0,$inp));
1902 &movups ($rndkey0,&QWP(0x10,$inp));
1903 &xorps ($inout0,&QWP(0,"esp")); # ^= IV
1904 &xorps ($inout1,$rndkey1);
1905 &movups ($rndkey1,&QWP(0x20,$inp));
1906 &xorps ($inout2,$rndkey0);
1907 &movups ($rndkey0,&QWP(0x30,$inp));
1908 &xorps ($inout3,$rndkey1);
1909 &movups ($ivec,&QWP(0x40,$inp)); # IV
1910 &xorps ($inout4,$rndkey0);
1911 &movups (&QWP(0,$out),$inout0);
1912 &movups (&QWP(0x10,$out),$inout1);
1913 &movups (&QWP(0x20,$out),$inout2);
1914 &movups (&QWP(0x30,$out),$inout3);
1915 &lea ($out,&DWP(0x40,$out));
1916 &movaps ($inout0,$inout4);
1917 &sub ($len,0x50);
1918 &jmp (&label("cbc_dec_tail_collected"));
1920 &set_label("cbc_dec_one",16);
1921 if ($inline)
1922 { &aesni_inline_generate1("dec"); }
1923 else
1924 { &call ("_aesni_decrypt1"); }
1925 &xorps ($inout0,$ivec);
1926 &movaps ($ivec,$in0);
1927 &sub ($len,0x10);
1928 &jmp (&label("cbc_dec_tail_collected"));
1930 &set_label("cbc_dec_two",16);
1931 &call ("_aesni_decrypt2");
1932 &xorps ($inout0,$ivec);
1933 &xorps ($inout1,$in0);
1934 &movups (&QWP(0,$out),$inout0);
1935 &movaps ($inout0,$inout1);
1936 &lea ($out,&DWP(0x10,$out));
1937 &movaps ($ivec,$in1);
1938 &sub ($len,0x20);
1939 &jmp (&label("cbc_dec_tail_collected"));
1941 &set_label("cbc_dec_three",16);
1942 &call ("_aesni_decrypt3");
1943 &xorps ($inout0,$ivec);
1944 &xorps ($inout1,$in0);
1945 &xorps ($inout2,$in1);
1946 &movups (&QWP(0,$out),$inout0);
1947 &movaps ($inout0,$inout2);
1948 &movups (&QWP(0x10,$out),$inout1);
1949 &lea ($out,&DWP(0x20,$out));
1950 &movups ($ivec,&QWP(0x20,$inp));
1951 &sub ($len,0x30);
1952 &jmp (&label("cbc_dec_tail_collected"));
1954 &set_label("cbc_dec_four",16);
1955 &call ("_aesni_decrypt4");
1956 &movups ($rndkey1,&QWP(0x10,$inp));
1957 &movups ($rndkey0,&QWP(0x20,$inp));
1958 &xorps ($inout0,$ivec);
1959 &movups ($ivec,&QWP(0x30,$inp));
1960 &xorps ($inout1,$in0);
1961 &movups (&QWP(0,$out),$inout0);
1962 &xorps ($inout2,$rndkey1);
1963 &movups (&QWP(0x10,$out),$inout1);
1964 &xorps ($inout3,$rndkey0);
1965 &movups (&QWP(0x20,$out),$inout2);
1966 &lea ($out,&DWP(0x30,$out));
1967 &movaps ($inout0,$inout3);
1968 &sub ($len,0x40);
1970 &set_label("cbc_dec_tail_collected");
1971 &and ($len,15);
1972 &jnz (&label("cbc_dec_tail_partial"));
1973 &movups (&QWP(0,$out),$inout0);
1974 &jmp (&label("cbc_ret"));
1976 &set_label("cbc_dec_tail_partial",16);
1977 &movaps (&QWP(0,"esp"),$inout0);
1978 &mov ("ecx",16);
1979 &mov ($inp,"esp");
1980 &sub ("ecx",$len);
1981 &data_word(0xA4F3F689); # rep movsb
1983 &set_label("cbc_ret");
1984 &mov ("esp",&DWP(16,"esp")); # pull original %esp
1985 &mov ($key_,&wparam(4));
1986 &movups (&QWP(0,$key_),$ivec); # output IV
1987 &set_label("cbc_abort");
1988 &function_end("${PREFIX}_cbc_encrypt");
1990 ######################################################################
1991 # Mechanical port from aesni-x86_64.pl.
1993 # _aesni_set_encrypt_key is private interface,
1994 # input:
1995 # "eax" const unsigned char *userKey
1996 # $rounds int bits
1997 # $key AES_KEY *key
1998 # output:
1999 # "eax" return code
2000 # $rounds number of rounds minus one (9, 11 or 13)
# Emits the private key-expansion routine called by both public
# set_*_key entry points further down.
#   entry: "eax" = userKey, $rounds = key length in bits, $key = schedule
#   exit:  "eax" = 0 on success, -1 if either pointer is zero, -2 if the
#          bit count is not 128, 192 or 256; on success $rounds = 9, 11
#          or 13 and the same value is stored at byte offset 240 from the
#          caller's $key (see the per-size notes below).
# $movekey is defined outside this section of the file.
2002 &function_begin_B("_aesni_set_encrypt_key");
# Reject zero userKey / key pointers before touching memory.
2003 &test ("eax","eax");
2004 &jz (&label("bad_pointer"));
2005 &test ($key,$key);
2006 &jz (&label("bad_pointer"));
2008 &movups ("xmm0",&QWP(0,"eax")); # pull first 128 bits of *userKey
2009 &xorps ("xmm4","xmm4"); # low dword of xmm4 is assumed 0
# $key now points one slot (16 bytes) past the start of the schedule, so
# round 0 is written at a negative displacement below.
2010 &lea ($key,&DWP(16,$key));
# Dispatch on the requested key size; anything else is bad_keybits.
2011 &cmp ($rounds,256);
2012 &je (&label("14rounds"));
2013 &cmp ($rounds,192);
2014 &je (&label("12rounds"));
2015 &cmp ($rounds,128);
2016 &jne (&label("bad_keybits"));
# ---- 128-bit key --------------------------------------------------------
# Ten aeskeygenassist/call pairs; the immediate is the per-round constant.
# The first call enters at key_128_cold (nothing to store yet), the other
# nine enter at key_128, which stores the previous round key and advances
# $key by 16.
2018 &set_label("10rounds",16);
2019 &mov ($rounds,9);
2020 &$movekey (&QWP(-16,$key),"xmm0"); # round 0
2021 &aeskeygenassist("xmm1","xmm0",0x01); # round 1
2022 &call (&label("key_128_cold"));
2023 &aeskeygenassist("xmm1","xmm0",0x2); # round 2
2024 &call (&label("key_128"));
2025 &aeskeygenassist("xmm1","xmm0",0x04); # round 3
2026 &call (&label("key_128"));
2027 &aeskeygenassist("xmm1","xmm0",0x08); # round 4
2028 &call (&label("key_128"));
2029 &aeskeygenassist("xmm1","xmm0",0x10); # round 5
2030 &call (&label("key_128"));
2031 &aeskeygenassist("xmm1","xmm0",0x20); # round 6
2032 &call (&label("key_128"));
2033 &aeskeygenassist("xmm1","xmm0",0x40); # round 7
2034 &call (&label("key_128"));
2035 &aeskeygenassist("xmm1","xmm0",0x80); # round 8
2036 &call (&label("key_128"));
2037 &aeskeygenassist("xmm1","xmm0",0x1b); # round 9
2038 &call (&label("key_128"));
2039 &aeskeygenassist("xmm1","xmm0",0x36); # round 10
2040 &call (&label("key_128"));
# $key is now start+160: store the last round key there and the round
# count at start+160+80 = start+240.
2041 &$movekey (&QWP(0,$key),"xmm0");
2042 &mov (&DWP(80,$key),$rounds);
2043 &xor ("eax","eax");
2044 &ret();
# key_128: store the round key held in xmm0, advance $key, then fall
# through into the expansion step.
2046 &set_label("key_128",16);
2047 &$movekey (&QWP(0,$key),"xmm0");
2048 &lea ($key,&DWP(16,$key));
# key_128_cold: expansion step only.  The two shufps/xorps pairs leave
# dword i of xmm0 equal to the XOR of its original dwords 0..i, while
# xmm4's low dword stays 0 for the next call.
2049 &set_label("key_128_cold");
2050 &shufps ("xmm4","xmm0",0b00010000);
2051 &xorps ("xmm0","xmm4");
2052 &shufps ("xmm4","xmm0",0b10001100);
2053 &xorps ("xmm0","xmm4");
# Broadcast dword 3 of the aeskeygenassist result and mix it into all four
# dwords of the new round key.
2054 &shufps ("xmm1","xmm1",0b11111111); # critical path
2055 &xorps ("xmm0","xmm1");
2056 &ret();
# ---- 192-bit key --------------------------------------------------------
# xmm0 holds the first 128 key bits and the low qword of xmm2 the other 64.
# Each expansion step yields 192 bits, so key_192a / key_192b alternate to
# repack the stream into 16-byte round-key slots.
2058 &set_label("12rounds",16);
2059 &movq ("xmm2",&QWP(16,"eax")); # remaining 1/3 of *userKey
2060 &mov ($rounds,11);
2061 &$movekey (&QWP(-16,$key),"xmm0"); # round 0
2062 &aeskeygenassist("xmm1","xmm2",0x01); # round 1,2
2063 &call (&label("key_192a_cold"));
2064 &aeskeygenassist("xmm1","xmm2",0x02); # round 2,3
2065 &call (&label("key_192b"));
2066 &aeskeygenassist("xmm1","xmm2",0x04); # round 4,5
2067 &call (&label("key_192a"));
2068 &aeskeygenassist("xmm1","xmm2",0x08); # round 5,6
2069 &call (&label("key_192b"));
2070 &aeskeygenassist("xmm1","xmm2",0x10); # round 7,8
2071 &call (&label("key_192a"));
2072 &aeskeygenassist("xmm1","xmm2",0x20); # round 8,9
2073 &call (&label("key_192b"));
2074 &aeskeygenassist("xmm1","xmm2",0x40); # round 10,11
2075 &call (&label("key_192a"));
2076 &aeskeygenassist("xmm1","xmm2",0x80); # round 11,12
2077 &call (&label("key_192b"));
# The helpers advanced $key by 176 in total, so $key is start+192: store
# the last round key there and the round count at start+192+48 = start+240.
2078 &$movekey (&QWP(0,$key),"xmm0");
2079 &mov (&DWP(48,$key),$rounds);
2080 &xor ("eax","eax");
2081 &ret();
# key_192a: store xmm0 as one full round key and advance $key by 16.
2083 &set_label("key_192a",16);
2084 &$movekey (&QWP(0,$key),"xmm0");
2085 &lea ($key,&DWP(16,$key));
# key_192a_cold: keep a copy of xmm2 in xmm5; the following key_192b call
# writes its low qword out as the first half of a round key.
2086 &set_label("key_192a_cold",16);
2087 &movaps ("xmm5","xmm2");
# key_192b_warm: shared expansion step.  xmm0 gets the same running-XOR
# treatment as in the 128-bit case; xmm3 carries xmm2 shifted left by one
# dword so that xmm2 is folded the same way.
2088 &set_label("key_192b_warm");
2089 &shufps ("xmm4","xmm0",0b00010000);
2090 &movdqa ("xmm3","xmm2");
2091 &xorps ("xmm0","xmm4");
2092 &shufps ("xmm4","xmm0",0b10001100);
2093 &pslldq ("xmm3",4);
2094 &xorps ("xmm0","xmm4");
# Broadcast dword 1 of the aeskeygenassist result into xmm0.
2095 &pshufd ("xmm1","xmm1",0b01010101); # critical path
2096 &pxor ("xmm2","xmm3");
2097 &pxor ("xmm0","xmm1");
# Chain the top dword of the updated xmm0 into xmm2.
2098 &pshufd ("xmm3","xmm0",0b11111111);
2099 &pxor ("xmm2","xmm3");
2100 &ret();
# key_192b: write two round-key slots, then jump into the shared step.
#   slot 0 = low qword of xmm5  | low qword of xmm0
#   slot 1 = high qword of xmm0 | low qword of xmm2
2102 &set_label("key_192b",16);
2103 &movaps ("xmm3","xmm0");
2104 &shufps ("xmm5","xmm0",0b01000100);
2105 &$movekey (&QWP(0,$key),"xmm5");
2106 &shufps ("xmm3","xmm2",0b01001110);
2107 &$movekey (&QWP(16,$key),"xmm3");
2108 &lea ($key,&DWP(32,$key));
2109 &jmp (&label("key_192b_warm"));
# ---- 256-bit key --------------------------------------------------------
# xmm0 / xmm2 hold the two halves of the user key and are stored directly
# as rounds 0 and 1.  key_256a derives the next xmm0 (and stores xmm2);
# key_256b derives the next xmm2 (and stores xmm0).
2111 &set_label("14rounds",16);
2112 &movups ("xmm2",&QWP(16,"eax")); # remaining half of *userKey
2113 &mov ($rounds,13);
2114 &lea ($key,&DWP(16,$key));
2115 &$movekey (&QWP(-32,$key),"xmm0"); # round 0
2116 &$movekey (&QWP(-16,$key),"xmm2"); # round 1
2117 &aeskeygenassist("xmm1","xmm2",0x01); # round 2
2118 &call (&label("key_256a_cold"));
2119 &aeskeygenassist("xmm1","xmm0",0x01); # round 3
2120 &call (&label("key_256b"));
2121 &aeskeygenassist("xmm1","xmm2",0x02); # round 4
2122 &call (&label("key_256a"));
2123 &aeskeygenassist("xmm1","xmm0",0x02); # round 5
2124 &call (&label("key_256b"));
2125 &aeskeygenassist("xmm1","xmm2",0x04); # round 6
2126 &call (&label("key_256a"));
2127 &aeskeygenassist("xmm1","xmm0",0x04); # round 7
2128 &call (&label("key_256b"));
2129 &aeskeygenassist("xmm1","xmm2",0x08); # round 8
2130 &call (&label("key_256a"));
2131 &aeskeygenassist("xmm1","xmm0",0x08); # round 9
2132 &call (&label("key_256b"));
2133 &aeskeygenassist("xmm1","xmm2",0x10); # round 10
2134 &call (&label("key_256a"));
2135 &aeskeygenassist("xmm1","xmm0",0x10); # round 11
2136 &call (&label("key_256b"));
2137 &aeskeygenassist("xmm1","xmm2",0x20); # round 12
2138 &call (&label("key_256a"));
2139 &aeskeygenassist("xmm1","xmm0",0x20); # round 13
2140 &call (&label("key_256b"));
2141 &aeskeygenassist("xmm1","xmm2",0x40); # round 14
2142 &call (&label("key_256a"));
# Twelve storing calls advanced $key to start+224: store the last round
# key there and the round count at start+224+16 = start+240.
2143 &$movekey (&QWP(0,$key),"xmm0");
2144 &mov (&DWP(16,$key),$rounds);
2145 &xor ("eax","eax");
2146 &ret();
# key_256a: store xmm2, advance $key, fall through to derive the next xmm0.
2148 &set_label("key_256a",16);
2149 &$movekey (&QWP(0,$key),"xmm2");
2150 &lea ($key,&DWP(16,$key));
# key_256a_cold: same running-XOR step as key_128_cold, applied to xmm0.
2151 &set_label("key_256a_cold");
2152 &shufps ("xmm4","xmm0",0b00010000);
2153 &xorps ("xmm0","xmm4");
2154 &shufps ("xmm4","xmm0",0b10001100);
2155 &xorps ("xmm0","xmm4");
2156 &shufps ("xmm1","xmm1",0b11111111); # critical path
2157 &xorps ("xmm0","xmm1");
2158 &ret();
# key_256b: store xmm0, advance $key, then derive the next xmm2.  Unlike
# key_256a it broadcasts dword 2 of the aeskeygenassist result.
2160 &set_label("key_256b",16);
2161 &$movekey (&QWP(0,$key),"xmm0");
2162 &lea ($key,&DWP(16,$key));
2164 &shufps ("xmm4","xmm2",0b00010000);
2165 &xorps ("xmm2","xmm4");
2166 &shufps ("xmm4","xmm2",0b10001100);
2167 &xorps ("xmm2","xmm4");
2168 &shufps ("xmm1","xmm1",0b10101010); # critical path
2169 &xorps ("xmm2","xmm1");
2170 &ret();
# ---- error exits --------------------------------------------------------
# -1: userKey or key pointer was zero.
2172 &set_label("bad_pointer",4);
2173 &mov ("eax",-1);
2174 &ret ();
# -2: bit count was not 128, 192 or 256.
2175 &set_label("bad_keybits",4);
2176 &mov ("eax",-2);
2177 &ret ();
2178 &function_end_B("_aesni_set_encrypt_key");
2180 # int $PREFIX_set_encrypt_key (const unsigned char *userKey, int bits,
2181 # AES_KEY *key)
# Public wrapper: loads the three stack arguments into the registers that
# _aesni_set_encrypt_key expects ("eax" = userKey, $rounds = bits,
# $key = key) and returns whatever that routine leaves in "eax".
2182 &function_begin_B("${PREFIX}_set_encrypt_key");
2183 &mov ("eax",&wparam(0));
2184 &mov ($rounds,&wparam(1));
2185 &mov ($key,&wparam(2));
2186 &call ("_aesni_set_encrypt_key");
2187 &ret ();
2188 &function_end_B("${PREFIX}_set_encrypt_key");
2190 # int $PREFIX_set_decrypt_key (const unsigned char *userKey, int bits,
2191 # AES_KEY *key)
# Public wrapper: builds the encryption schedule with
# _aesni_set_encrypt_key, then converts it in place to a decryption
# schedule by reversing the order of the round keys and applying aesimc to
# every round key except the first and last.  Returns the helper's
# non-zero code unchanged on failure, 0 on success.
2192 &function_begin_B("${PREFIX}_set_decrypt_key");
2193 &mov ("eax",&wparam(0));
2194 &mov ($rounds,&wparam(1));
2195 &mov ($key,&wparam(2));
2196 &call ("_aesni_set_encrypt_key");
# The helper advanced $key while writing the schedule; reload the original
# pointer from the stack argument.
2197 &mov ($key,&wparam(2));
# Scale the round count to a byte offset (16 bytes per round key).  This is
# done before the error check but is only used on the success path.
2198 &shl ($rounds,4); # rounds-1 after _aesni_set_encrypt_key
2199 &test ("eax","eax");
2200 &jnz (&label("dec_key_ret"));
2201 &lea ("eax",&DWP(16,$key,$rounds)); # end of key schedule
# First and last round keys trade places without transformation.
2203 &$movekey ("xmm0",&QWP(0,$key)); # just swap
2204 &$movekey ("xmm1",&QWP(0,"eax"));
2205 &$movekey (&QWP(0,"eax"),"xmm0");
2206 &$movekey (&QWP(0,$key),"xmm1");
2207 &lea ($key,&DWP(16,$key));
2208 &lea ("eax",&DWP(-16,"eax"));
# Walk $key upward and "eax" downward, one slot per iteration: load both
# round keys, run aesimc on each, step the pointers, then store each result
# in the slot the other was loaded from (16("eax") / -16($key) address the
# pre-step positions).
2210 &set_label("dec_key_inverse");
2211 &$movekey ("xmm0",&QWP(0,$key)); # swap and inverse
2212 &$movekey ("xmm1",&QWP(0,"eax"));
2213 &aesimc ("xmm0","xmm0");
2214 &aesimc ("xmm1","xmm1");
2215 &lea ($key,&DWP(16,$key));
2216 &lea ("eax",&DWP(-16,"eax"));
2217 &$movekey (&QWP(16,"eax"),"xmm0");
2218 &$movekey (&QWP(-16,$key),"xmm1");
# Continue while the upper pointer is still above the lower one (unsigned).
2219 &cmp ("eax",$key);
2220 &ja (&label("dec_key_inverse"));
# The remaining middle round key is transformed where it sits.
2222 &$movekey ("xmm0",&QWP(0,$key)); # inverse middle
2223 &aesimc ("xmm0","xmm0");
2224 &$movekey (&QWP(0,$key),"xmm0");
2226 &xor ("eax","eax"); # return success
2227 &set_label("dec_key_ret");
2228 &ret ();
2229 &function_end_B("${PREFIX}_set_decrypt_key");
# Trailer: pass the module's identification string to asciz, then call
# asm_finish.  Both helpers are defined outside this file section;
# presumably they emit the string into the generated assembly and flush
# the output -- confirm against the perlasm support module.
2230 &asciz("AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>");
2232 &asm_finish();