OpenSSL: Update to version 1.0.1e
[tomato.git] / release / src / router / openssl / crypto / camellia / asm / cmll-x86_64.pl
blob9f4b82fa48219540f398e483a8a1dc852868e328
1 #!/usr/bin/env perl
3 # ====================================================================
4 # Copyright (c) 2008 Andy Polyakov <appro@openssl.org>
6 # This module may be used under the terms of either the GNU General
7 # Public License version 2 or later, the GNU Lesser General Public
8 # License version 2.1 or later, the Mozilla Public License version
9 # 1.1 or the BSD License. The exact terms of either license are
10 # distributed along with this module. For further details see
11 # http://www.openssl.org/~appro/camellia/.
12 # ====================================================================
14 # Performance in cycles per processed byte (less is better) in
15 # 'openssl speed ...' benchmark:
17 # AMD64 Core2 EM64T
18 # -evp camellia-128-ecb 16.7 21.0 22.7
19 # + over gcc 3.4.6 +25% +5% 0%
21 # camellia-128-cbc 15.7 20.4 21.1
23 # 128-bit key setup 128 216 205 cycles/key
24 # + over gcc 3.4.6 +54% +39% +15%
26 # Numbers in "+" rows represent performance improvement over compiler
27 # generated code. Key setup timings are impressive on AMD and Core2
28 # thanks to 64-bit operations being covertly deployed. Improvement on
29 # EM64T, pre-Core2 Intel x86_64 CPU, is not as impressive, because it
30 # apparently emulates some of 64-bit operations in [32-bit] microcode.
# Command line: "<script> [flavour] [output]".  A lone argument that contains
# a dot is taken to be the output file name rather than a flavour.
$flavour = shift;
$output = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

# Win64 conventions are selected by an nasm/masm/mingw64 flavour or by an
# output file name ending in ".asm".
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Look for the x86_64-xlate.pl translator next to this script first, then in
# ../../perlasm relative to it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# From here on everything printed to STDOUT goes through the translator.
# Abort if the pipe cannot be started instead of silently emitting nothing.
open OUT,"| \"$^X\" $xlate $flavour $output"
    or die "can't call $xlate: $!";
*STDOUT=*OUT;
# hi(REG): rewrite a register name of the form %[er][a-d]x (e.g. "%eax",
# "%rbx") to its high-byte alias ("%ah", "%bh").  Any other string is
# returned unchanged.  No prototype: the sub takes exactly one argument.
sub hi { my $r=shift; $r =~ s/%[er]([a-d])x/%${1}h/; $r; }
# lo(REG): rewrite a register name to its low-byte alias:
#   %[er][a-d]x   -> %[a-d]l    ("%eax"  -> "%al")
#   %[er][sd]i    -> %[sd]il    ("%esi"  -> "%sil")
#   %rN or %rNd   -> %rNb       ("%r10d" -> "%r10b")
# The three substitutions are applied in that order; a string matching none
# of them is returned unchanged.  No prototype: the sub takes one argument.
sub lo {
    my $r=shift;
    $r =~ s/%[er]([a-d])x/%${1}l/;
    $r =~ s/%[er]([sd]i)/%${1}l/;
    $r =~ s/%(r[0-9]+)[d]?/%${1}b/;
    $r;
}
# Register allocation shared by all code templates below.
($t0,$t1,$t2,$t3)=("%eax","%ebx","%ecx","%edx");	# 32-bit temporaries
@S=qw(%r8d %r9d %r10d %r11d);				# four 32-bit state words
($i0,$i1)=("%esi","%edi");				# table indices
$Tbl="%rbp"; # size optimization
($inp,$out,$key,$keyend)=("%r12","%r13","%r14","%r15");
# 32-bit view of the first integer argument: %ecx on Win64, %edi otherwise.
if ($win64)	{ $arg0d="%ecx"; }
else		{ $arg0d="%edi"; }

# const unsigned int Camellia_SBOX[4][256];
# Well, sort of... Camellia_SBOX[0][] is interleaved with [1][],
# and [2][] - with [3][]. This is done to minimize code size.
# Byte offsets of each logical table from the .LCamellia_SBOX label:
($SBOX1_1110,$SBOX4_4404)=(0,4);	# Camellia_SBOX[0], Camellia_SBOX[1]
($SBOX2_0222,$SBOX3_3033)=(2048,2052);	# Camellia_SBOX[2], Camellia_SBOX[3]
# Camellia_Feistel(ROUND [,SEED]): append the assembly for one Feistel round
# to $code.
#   ROUND - round index.  Its parity picks which pair of @S words is the
#           round input (xored with the key held in $t0/$t1) and which pair
#           receives the round output.
#   SEED  - constant added to the displacement used to prefetch the next
#           round key from ($key).  A negative SEED also makes the prefetch
#           step -8 bytes per round instead of +8.  Defaults to 0.
# The emitted code expects the current round key in $t1/$t0 and leaves the
# key for round ROUND+1 there.  The `...` spans are evaluated later, when
# the finished $code is post-processed.
sub Camellia_Feistel {
    my $i=$_[0];
    my $seed=defined($_[1])?$_[1]:0;
    my $scale=$seed<0?-8:8;
    my $j=($i&1)*2;
    # A single "my" over the whole list keeps $s1..$s3 lexical too.
    my ($s0,$s1,$s2,$s3)=($S[($j)%4],$S[($j+1)%4],$S[($j+2)%4],$S[($j+3)%4]);

    $code.=<<___;
xor $s0,$t0 # t0^=key[0]
xor $s1,$t1 # t1^=key[1]
movz `&hi("$t0")`,$i0 # (t0>>8)&0xff
movz `&lo("$t1")`,$i1 # (t1>>0)&0xff
mov $SBOX3_3033($Tbl,$i0,8),$t3 # t3=SBOX3_3033[0]
mov $SBOX1_1110($Tbl,$i1,8),$t2 # t2=SBOX1_1110[1]
movz `&lo("$t0")`,$i0 # (t0>>0)&0xff
shr \$16,$t0
movz `&hi("$t1")`,$i1 # (t1>>8)&0xff
xor $SBOX4_4404($Tbl,$i0,8),$t3 # t3^=SBOX4_4404[0]
shr \$16,$t1
xor $SBOX4_4404($Tbl,$i1,8),$t2 # t2^=SBOX4_4404[1]
movz `&hi("$t0")`,$i0 # (t0>>24)&0xff
movz `&lo("$t1")`,$i1 # (t1>>16)&0xff
xor $SBOX1_1110($Tbl,$i0,8),$t3 # t3^=SBOX1_1110[0]
xor $SBOX3_3033($Tbl,$i1,8),$t2 # t2^=SBOX3_3033[1]
movz `&lo("$t0")`,$i0 # (t0>>16)&0xff
movz `&hi("$t1")`,$i1 # (t1>>24)&0xff
xor $SBOX2_0222($Tbl,$i0,8),$t3 # t3^=SBOX2_0222[0]
xor $SBOX2_0222($Tbl,$i1,8),$t2 # t2^=SBOX2_0222[1]
mov `$seed+($i+1)*$scale`($key),$t1 # prefetch key[i+1]
mov `$seed+($i+1)*$scale+4`($key),$t0
xor $t3,$t2 # t2^=t3
ror \$8,$t3 # t3=RightRotate(t3,8)
xor $t2,$s2
xor $t2,$s3
xor $t3,$s3
___
}
108 # void Camellia_EncryptBlock_Rounds(
109 # int grandRounds,
110 # const Byte plaintext[],
111 # const KEY_TABLE_TYPE keyTable,
112 # Byte ciphertext[])
113 $code=<<___;
114 .text
116 # V1.x API
117 .globl Camellia_EncryptBlock
118 .type Camellia_EncryptBlock,\@abi-omnipotent
119 .align 16
120 Camellia_EncryptBlock:
121 movl \$128,%eax
122 subl $arg0d,%eax
123 movl \$3,$arg0d
124 adcl \$0,$arg0d # keyBitLength==128?3:4
125 jmp .Lenc_rounds
126 .size Camellia_EncryptBlock,.-Camellia_EncryptBlock
127 # V2
128 .globl Camellia_EncryptBlock_Rounds
129 .type Camellia_EncryptBlock_Rounds,\@function,4
130 .align 16
131 .Lenc_rounds:
132 Camellia_EncryptBlock_Rounds:
133 push %rbx
134 push %rbp
135 push %r13
136 push %r14
137 push %r15
138 .Lenc_prologue:
140 #mov %rsi,$inp # put away arguments
141 mov %rcx,$out
142 mov %rdx,$key
144 shl \$6,%edi # process grandRounds
145 lea .LCamellia_SBOX(%rip),$Tbl
146 lea ($key,%rdi),$keyend
148 mov 0(%rsi),@S[0] # load plaintext
149 mov 4(%rsi),@S[1]
150 mov 8(%rsi),@S[2]
151 bswap @S[0]
152 mov 12(%rsi),@S[3]
153 bswap @S[1]
154 bswap @S[2]
155 bswap @S[3]
157 call _x86_64_Camellia_encrypt
159 bswap @S[0]
160 bswap @S[1]
161 bswap @S[2]
162 mov @S[0],0($out)
163 bswap @S[3]
164 mov @S[1],4($out)
165 mov @S[2],8($out)
166 mov @S[3],12($out)
168 mov 0(%rsp),%r15
169 mov 8(%rsp),%r14
170 mov 16(%rsp),%r13
171 mov 24(%rsp),%rbp
172 mov 32(%rsp),%rbx
173 lea 40(%rsp),%rsp
174 .Lenc_epilogue:
176 .size Camellia_EncryptBlock_Rounds,.-Camellia_EncryptBlock_Rounds
178 .type _x86_64_Camellia_encrypt,\@abi-omnipotent
179 .align 16
180 _x86_64_Camellia_encrypt:
181 xor 0($key),@S[1]
182 xor 4($key),@S[0] # ^=key[0-3]
183 xor 8($key),@S[3]
184 xor 12($key),@S[2]
185 .align 16
186 .Leloop:
187 mov 16($key),$t1 # prefetch key[4-5]
188 mov 20($key),$t0
191 for ($i=0;$i<6;$i++) { Camellia_Feistel($i,16); }
192 $code.=<<___;
193 lea 16*4($key),$key
194 cmp $keyend,$key
195 mov 8($key),$t3 # prefetch key[2-3]
196 mov 12($key),$t2
197 je .Ledone
199 and @S[0],$t0
200 or @S[3],$t3
201 rol \$1,$t0
202 xor $t3,@S[2] # s2^=s3|key[3];
203 xor $t0,@S[1] # s1^=LeftRotate(s0&key[0],1);
204 and @S[2],$t2
205 or @S[1],$t1
206 rol \$1,$t2
207 xor $t1,@S[0] # s0^=s1|key[1];
208 xor $t2,@S[3] # s3^=LeftRotate(s2&key[2],1);
209 jmp .Leloop
211 .align 16
212 .Ledone:
213 xor @S[2],$t0 # SwapHalf
214 xor @S[3],$t1
215 xor @S[0],$t2
216 xor @S[1],$t3
218 mov $t0,@S[0]
219 mov $t1,@S[1]
220 mov $t2,@S[2]
221 mov $t3,@S[3]
223 .byte 0xf3,0xc3 # rep ret
224 .size _x86_64_Camellia_encrypt,.-_x86_64_Camellia_encrypt
226 # V1.x API
227 .globl Camellia_DecryptBlock
228 .type Camellia_DecryptBlock,\@abi-omnipotent
229 .align 16
230 Camellia_DecryptBlock:
231 movl \$128,%eax
232 subl $arg0d,%eax
233 movl \$3,$arg0d
234 adcl \$0,$arg0d # keyBitLength==128?3:4
235 jmp .Ldec_rounds
236 .size Camellia_DecryptBlock,.-Camellia_DecryptBlock
237 # V2
238 .globl Camellia_DecryptBlock_Rounds
239 .type Camellia_DecryptBlock_Rounds,\@function,4
240 .align 16
241 .Ldec_rounds:
242 Camellia_DecryptBlock_Rounds:
243 push %rbx
244 push %rbp
245 push %r13
246 push %r14
247 push %r15
248 .Ldec_prologue:
250 #mov %rsi,$inp # put away arguments
251 mov %rcx,$out
252 mov %rdx,$keyend
254 shl \$6,%edi # process grandRounds
255 lea .LCamellia_SBOX(%rip),$Tbl
256 lea ($keyend,%rdi),$key
258 mov 0(%rsi),@S[0] # load plaintext
259 mov 4(%rsi),@S[1]
260 mov 8(%rsi),@S[2]
261 bswap @S[0]
262 mov 12(%rsi),@S[3]
263 bswap @S[1]
264 bswap @S[2]
265 bswap @S[3]
267 call _x86_64_Camellia_decrypt
269 bswap @S[0]
270 bswap @S[1]
271 bswap @S[2]
272 mov @S[0],0($out)
273 bswap @S[3]
274 mov @S[1],4($out)
275 mov @S[2],8($out)
276 mov @S[3],12($out)
278 mov 0(%rsp),%r15
279 mov 8(%rsp),%r14
280 mov 16(%rsp),%r13
281 mov 24(%rsp),%rbp
282 mov 32(%rsp),%rbx
283 lea 40(%rsp),%rsp
284 .Ldec_epilogue:
286 .size Camellia_DecryptBlock_Rounds,.-Camellia_DecryptBlock_Rounds
288 .type _x86_64_Camellia_decrypt,\@abi-omnipotent
289 .align 16
290 _x86_64_Camellia_decrypt:
291 xor 0($key),@S[1]
292 xor 4($key),@S[0] # ^=key[0-3]
293 xor 8($key),@S[3]
294 xor 12($key),@S[2]
295 .align 16
296 .Ldloop:
297 mov -8($key),$t1 # prefetch key[4-5]
298 mov -4($key),$t0
301 for ($i=0;$i<6;$i++) { Camellia_Feistel($i,-8); }
302 $code.=<<___;
303 lea -16*4($key),$key
304 cmp $keyend,$key
305 mov 0($key),$t3 # prefetch key[2-3]
306 mov 4($key),$t2
307 je .Lddone
309 and @S[0],$t0
310 or @S[3],$t3
311 rol \$1,$t0
312 xor $t3,@S[2] # s2^=s3|key[3];
313 xor $t0,@S[1] # s1^=LeftRotate(s0&key[0],1);
314 and @S[2],$t2
315 or @S[1],$t1
316 rol \$1,$t2
317 xor $t1,@S[0] # s0^=s1|key[1];
318 xor $t2,@S[3] # s3^=LeftRotate(s2&key[2],1);
320 jmp .Ldloop
322 .align 16
323 .Lddone:
324 xor @S[2],$t2
325 xor @S[3],$t3
326 xor @S[0],$t0
327 xor @S[1],$t1
329 mov $t2,@S[0] # SwapHalf
330 mov $t3,@S[1]
331 mov $t0,@S[2]
332 mov $t1,@S[3]
334 .byte 0xf3,0xc3 # rep ret
335 .size _x86_64_Camellia_decrypt,.-_x86_64_Camellia_decrypt
# _saveround(RND, KEYREG, [BIAS,] REGS...): append to $code the stores that
# write REGS into the key table addressed by KEYREG at byte offset
# BIAS+RND*8 (the offset is left as a `...` expression for later evaluation).
#   - BIAS is optional; it is recognised as a leading plain integer (so an
#     explicit 0 is accepted) and defaults to 0.
#   - With exactly four registers the words are stored pairwise swapped:
#     REGS[1] at +0, REGS[0] at +4, REGS[3] at +8, REGS[2] at +12.
#   - Otherwise REGS[0] is stored at +0 and, if present, REGS[1] at +8.
sub _saveround {
    my ($rnd,$key,@T)=@_;
    my $bias=(defined($T[0]) && $T[0] =~ /^[-+]?\d+$/)?shift(@T):0;

    if ($#T==3) {
	$code.=<<___;
mov $T[1],`$bias+$rnd*8+0`($key)
mov $T[0],`$bias+$rnd*8+4`($key)
mov $T[3],`$bias+$rnd*8+8`($key)
mov $T[2],`$bias+$rnd*8+12`($key)
___
    } else {
	$code.=" mov $T[0],`$bias+$rnd*8+0`($key)\n";
	$code.=" mov $T[1],`$bias+$rnd*8+8`($key)\n" if ($#T>=1);
    }
}
# _loadround(RND, KEYREG, [BIAS,] REG0 [,REG1]): append to $code the loads
# that read REG0 from byte offset BIAS+RND*8 of the key table addressed by
# KEYREG and, if REG1 is given, REG1 from 8 bytes further on.  BIAS is
# optional; it is recognised as a leading plain integer (so an explicit 0 is
# accepted) and defaults to 0.  The offsets are left as `...` expressions
# for later evaluation.
sub _loadround {
    my ($rnd,$key,@T)=@_;
    my $bias=(defined($T[0]) && $T[0] =~ /^[-+]?\d+$/)?shift(@T):0;

    $code.=" mov `$bias+$rnd*8+0`($key),$T[0]\n";
    $code.=" mov `$bias+$rnd*8+8`($key),$T[1]\n" if ($#T>=1);
}
363 # shld is very slow on Intel EM64T family. Even on AMD it limits
364 # instruction decode rate [because it's VectorPath] and consequently
365 # performance...
# __rotl128(HI, LO, ROT): shld-based variant.  Appends to $code a sequence
# that saves HI in %r11 and then uses two shld instructions to shift each
# register left by ROT bits, filling from the other register's original
# value.  Emits nothing when ROT is 0.  %r11 is overwritten.
sub __rotl128 {
    my ($i0,$i1,$rot)=@_;

    if ($rot) {
	$code.=<<___;
mov $i0,%r11
shld \$$rot,$i1,$i0
shld \$$rot,%r11,$i1
___
    }
}
# ... Implementing 128-bit rotate without shld gives 80% better
# performance on EM64T, +15% on AMD64 and only ~7% degradation on
# Core2. This is therefore preferred.
# _rotl128(HI, LO, ROT): shift/or variant.  Appends to $code a sequence that
# shifts both registers left by ROT bits and ors into each one the top ROT
# bits of the other register's original value (obtained with a right shift
# by 64-ROT, left as a `...` expression for later evaluation).  Emits
# nothing when ROT is 0.  %r9 and %r11 are overwritten.
sub _rotl128 {
    my ($i0,$i1,$rot)=@_;

    if ($rot) {
	$code.=<<___;
mov $i0,%r11
shl \$$rot,$i0
mov $i1,%r9
shr \$`64-$rot`,%r9
shr \$`64-$rot`,%r11
or %r9,$i0
shl \$$rot,$i1
or %r11,$i1
___
    }
}
398 { my $step=0;
400 $code.=<<___;
401 .globl Camellia_Ekeygen
402 .type Camellia_Ekeygen,\@function,3
403 .align 16
404 Camellia_Ekeygen:
405 push %rbx
406 push %rbp
407 push %r13
408 push %r14
409 push %r15
410 .Lkey_prologue:
412 mov %rdi,$keyend # put away arguments, keyBitLength
413 mov %rdx,$out # keyTable
415 mov 0(%rsi),@S[0] # load 0-127 bits
416 mov 4(%rsi),@S[1]
417 mov 8(%rsi),@S[2]
418 mov 12(%rsi),@S[3]
420 bswap @S[0]
421 bswap @S[1]
422 bswap @S[2]
423 bswap @S[3]
425 &_saveround (0,$out,@S); # KL<<<0
426 $code.=<<___;
427 cmp \$128,$keyend # check keyBitLength
428 je .L1st128
430 mov 16(%rsi),@S[0] # load 128-191 bits
431 mov 20(%rsi),@S[1]
432 cmp \$192,$keyend
433 je .L1st192
434 mov 24(%rsi),@S[2] # load 192-255 bits
435 mov 28(%rsi),@S[3]
436 jmp .L1st256
437 .L1st192:
438 mov @S[0],@S[2]
439 mov @S[1],@S[3]
440 not @S[2]
441 not @S[3]
442 .L1st256:
443 bswap @S[0]
444 bswap @S[1]
445 bswap @S[2]
446 bswap @S[3]
448 &_saveround (4,$out,@S); # temp storage for KR!
449 $code.=<<___;
450 xor 0($out),@S[1] # KR^KL
451 xor 4($out),@S[0]
452 xor 8($out),@S[3]
453 xor 12($out),@S[2]
455 .L1st128:
456 lea .LCamellia_SIGMA(%rip),$key
457 lea .LCamellia_SBOX(%rip),$Tbl
459 mov 0($key),$t1
460 mov 4($key),$t0
462 &Camellia_Feistel($step++);
463 &Camellia_Feistel($step++);
464 $code.=<<___;
465 xor 0($out),@S[1] # ^KL
466 xor 4($out),@S[0]
467 xor 8($out),@S[3]
468 xor 12($out),@S[2]
470 &Camellia_Feistel($step++);
471 &Camellia_Feistel($step++);
472 $code.=<<___;
473 cmp \$128,$keyend
474 jne .L2nd256
476 lea 128($out),$out # size optimization
477 shl \$32,%r8 # @S[0]||
478 shl \$32,%r10 # @S[2]||
479 or %r9,%r8 # ||@S[1]
480 or %r11,%r10 # ||@S[3]
482 &_loadround (0,$out,-128,"%rax","%rbx"); # KL
483 &_saveround (2,$out,-128,"%r8","%r10"); # KA<<<0
484 &_rotl128 ("%rax","%rbx",15);
485 &_saveround (4,$out,-128,"%rax","%rbx"); # KL<<<15
486 &_rotl128 ("%r8","%r10",15);
487 &_saveround (6,$out,-128,"%r8","%r10"); # KA<<<15
488 &_rotl128 ("%r8","%r10",15); # 15+15=30
489 &_saveround (8,$out,-128,"%r8","%r10"); # KA<<<30
490 &_rotl128 ("%rax","%rbx",30); # 15+30=45
491 &_saveround (10,$out,-128,"%rax","%rbx"); # KL<<<45
492 &_rotl128 ("%r8","%r10",15); # 30+15=45
493 &_saveround (12,$out,-128,"%r8"); # KA<<<45
494 &_rotl128 ("%rax","%rbx",15); # 45+15=60
495 &_saveround (13,$out,-128,"%rbx"); # KL<<<60
496 &_rotl128 ("%r8","%r10",15); # 45+15=60
497 &_saveround (14,$out,-128,"%r8","%r10"); # KA<<<60
498 &_rotl128 ("%rax","%rbx",17); # 60+17=77
499 &_saveround (16,$out,-128,"%rax","%rbx"); # KL<<<77
500 &_rotl128 ("%rax","%rbx",17); # 77+17=94
501 &_saveround (18,$out,-128,"%rax","%rbx"); # KL<<<94
502 &_rotl128 ("%r8","%r10",34); # 60+34=94
503 &_saveround (20,$out,-128,"%r8","%r10"); # KA<<<94
504 &_rotl128 ("%rax","%rbx",17); # 94+17=111
505 &_saveround (22,$out,-128,"%rax","%rbx"); # KL<<<111
506 &_rotl128 ("%r8","%r10",17); # 94+17=111
507 &_saveround (24,$out,-128,"%r8","%r10"); # KA<<<111
508 $code.=<<___;
509 mov \$3,%eax
510 jmp .Ldone
511 .align 16
512 .L2nd256:
514 &_saveround (6,$out,@S); # temp storage for KA!
515 $code.=<<___;
516 xor `4*8+0`($out),@S[1] # KA^KR
517 xor `4*8+4`($out),@S[0]
518 xor `5*8+0`($out),@S[3]
519 xor `5*8+4`($out),@S[2]
521 &Camellia_Feistel($step++);
522 &Camellia_Feistel($step++);
524 &_loadround (0,$out,"%rax","%rbx"); # KL
525 &_loadround (4,$out,"%rcx","%rdx"); # KR
526 &_loadround (6,$out,"%r14","%r15"); # KA
527 $code.=<<___;
528 lea 128($out),$out # size optimization
529 shl \$32,%r8 # @S[0]||
530 shl \$32,%r10 # @S[2]||
531 or %r9,%r8 # ||@S[1]
532 or %r11,%r10 # ||@S[3]
534 &_saveround (2,$out,-128,"%r8","%r10"); # KB<<<0
535 &_rotl128 ("%rcx","%rdx",15);
536 &_saveround (4,$out,-128,"%rcx","%rdx"); # KR<<<15
537 &_rotl128 ("%r14","%r15",15);
538 &_saveround (6,$out,-128,"%r14","%r15"); # KA<<<15
539 &_rotl128 ("%rcx","%rdx",15); # 15+15=30
540 &_saveround (8,$out,-128,"%rcx","%rdx"); # KR<<<30
541 &_rotl128 ("%r8","%r10",30);
542 &_saveround (10,$out,-128,"%r8","%r10"); # KB<<<30
543 &_rotl128 ("%rax","%rbx",45);
544 &_saveround (12,$out,-128,"%rax","%rbx"); # KL<<<45
545 &_rotl128 ("%r14","%r15",30); # 15+30=45
546 &_saveround (14,$out,-128,"%r14","%r15"); # KA<<<45
547 &_rotl128 ("%rax","%rbx",15); # 45+15=60
548 &_saveround (16,$out,-128,"%rax","%rbx"); # KL<<<60
549 &_rotl128 ("%rcx","%rdx",30); # 30+30=60
550 &_saveround (18,$out,-128,"%rcx","%rdx"); # KR<<<60
551 &_rotl128 ("%r8","%r10",30); # 30+30=60
552 &_saveround (20,$out,-128,"%r8","%r10"); # KB<<<60
553 &_rotl128 ("%rax","%rbx",17); # 60+17=77
554 &_saveround (22,$out,-128,"%rax","%rbx"); # KL<<<77
555 &_rotl128 ("%r14","%r15",32); # 45+32=77
556 &_saveround (24,$out,-128,"%r14","%r15"); # KA<<<77
557 &_rotl128 ("%rcx","%rdx",34); # 60+34=94
558 &_saveround (26,$out,-128,"%rcx","%rdx"); # KR<<<94
559 &_rotl128 ("%r14","%r15",17); # 77+17=94
&_saveround (28,$out,-128,"%r14","%r15"); # KA<<<94
561 &_rotl128 ("%rax","%rbx",34); # 77+34=111
562 &_saveround (30,$out,-128,"%rax","%rbx"); # KL<<<111
563 &_rotl128 ("%r8","%r10",51); # 60+51=111
564 &_saveround (32,$out,-128,"%r8","%r10"); # KB<<<111
565 $code.=<<___;
566 mov \$4,%eax
567 .Ldone:
568 mov 0(%rsp),%r15
569 mov 8(%rsp),%r14
570 mov 16(%rsp),%r13
571 mov 24(%rsp),%rbp
572 mov 32(%rsp),%rbx
573 lea 40(%rsp),%rsp
574 .Lkey_epilogue:
576 .size Camellia_Ekeygen,.-Camellia_Ekeygen
# Base 8-bit S-box (256 entries) from which the four 32-bit lookup tables
# are derived by the S* helpers below.
@SBOX=(
112,130, 44,236,179, 39,192,229,228,133, 87, 53,234, 12,174, 65,
 35,239,107,147, 69, 25,165, 33,237, 14, 79, 78, 29,101,146,189,
134,184,175,143,124,235, 31,206, 62, 48,220, 95, 94,197, 11, 26,
166,225, 57,202,213, 71, 93, 61,217,  1, 90,214, 81, 86,108, 77,
139, 13,154,102,251,204,176, 45,116, 18, 43, 32,240,177,132,153,
223, 76,203,194, 52,126,118,  5,109,183,169, 49,209, 23,  4,215,
 20, 88, 58, 97,222, 27, 17, 28, 50, 15,156, 22, 83, 24,242, 34,
254, 68,207,178,195,181,122,145, 36,  8,232,168, 96,252,105, 80,
170,208,160,125,161,137, 98,151, 84, 91, 30,149,224,255,100,210,
 16,196,  0, 72,163,247,117,219,138,  3,230,218,  9, 63,221,148,
135, 92,131,  2,205, 74,144, 51,115,103,246,243,157,127,191,226,
 82,155,216, 38,200, 55,198, 59,129,150,111, 75, 19,190, 99, 46,
233,121,167,140,159,110,188,142, 41,245,249,182, 47,253,180, 89,
120,152,  6,106,231, 70,113,186,212, 37,171, 66,136,162,141,250,
114,  7,185, 85,248,238,172, 10, 54, 73, 42,104, 60, 56,241,164,
 64, 40,211,123,187,201, 67,193, 21,227,173,244,119,199,128,158);

# Each helper takes a table index 0..255 and returns the 32-bit table entry
# formatted as "0x%08x".  The name encodes the byte layout of the entry
# (most significant byte first); "0" marks a zero byte.

# S1110: s=SBOX[i]; bytes are s,s,s,0.
sub S1110 { my $i=shift; $i=$SBOX[$i]; $i=$i<<24|$i<<16|$i<<8; sprintf("0x%08x",$i); }
# S4404: s=SBOX[i rotated left by one bit]; bytes are s,s,0,s.
sub S4404 { my $i=shift; $i=($i<<1|$i>>7)&0xff; $i=$SBOX[$i]; $i=$i<<24|$i<<16|$i; sprintf("0x%08x",$i); }
# S0222: s=SBOX[i] rotated left by one bit; bytes are 0,s,s,s.
sub S0222 { my $i=shift; $i=$SBOX[$i]; $i=($i<<1|$i>>7)&0xff; $i=$i<<16|$i<<8|$i; sprintf("0x%08x",$i); }
# S3033: s=SBOX[i] rotated right by one bit; bytes are s,0,s,s.
sub S3033 { my $i=shift; $i=$SBOX[$i]; $i=($i>>1|$i<<7)&0xff; $i=$i<<24|$i<<8|$i; sprintf("0x%08x",$i); }
# Emit the constant data: the SIGMA constants used by the key schedule,
# followed by the .LCamellia_SBOX label under which the lookup tables are
# generated below.
$code.=<<___;
.align 64
.LCamellia_SIGMA:
.long 0x3bcc908b, 0xa09e667f, 0x4caa73b2, 0xb67ae858
.long 0xe94f82be, 0xc6ef372f, 0xf1d36f1c, 0x54ff53a5
.long 0xde682d1d, 0x10e527fa, 0xb3e6c1fd, 0xb05688c2
.long 0, 0, 0, 0
.LCamellia_SBOX:
___
# tables are interleaved, remember?
# data_word(LIST): append one ".long" directive holding LIST to $code.
sub data_word { $code.=".long\t".join(',',@_)."\n"; }
# Each row pairs entry i of two tables, giving the 8-byte stride and the
# 0/4 and 2048/2052 base offsets used by the round code.
for my $i (0..255) { data_word(S1110($i),S4404($i)); }
for my $i (0..255) { data_word(S0222($i),S3033($i)); }
# void Camellia_cbc_encrypt (const unsigned char *inp, unsigned char *out,
618 # size_t length, const CAMELLIA_KEY *key,
619 # unsigned char *ivp,const int enc);
# Stack-frame slots used by Camellia_cbc_encrypt, as %rsp-relative operands:
#   $_key   0  saved key pointer
#   $_end   8  inp+len&~15
#   $_res  16  len&15
#   $ivec  24  16-byte scratch block
#   $_ivp  40  saved ivp argument
#   $_rsp  48  saved stack pointer
($_key,$_end,$_res,$ivec,$_ivp,$_rsp)=map { "${_}(%rsp)" } (0,8,16,24,40,48);
628 $code.=<<___;
629 .globl Camellia_cbc_encrypt
630 .type Camellia_cbc_encrypt,\@function,6
631 .align 16
632 Camellia_cbc_encrypt:
633 cmp \$0,%rdx
634 je .Lcbc_abort
635 push %rbx
636 push %rbp
637 push %r12
638 push %r13
639 push %r14
640 push %r15
641 .Lcbc_prologue:
643 mov %rsp,%rbp
644 sub \$64,%rsp
645 and \$-64,%rsp
647 # place stack frame just "above mod 1024" the key schedule,
648 # this ensures that cache associativity suffices
649 lea -64-63(%rcx),%r10
650 sub %rsp,%r10
651 neg %r10
652 and \$0x3C0,%r10
653 sub %r10,%rsp
654 #add \$8,%rsp # 8 is reserved for callee's ra
656 mov %rdi,$inp # inp argument
657 mov %rsi,$out # out argument
658 mov %r8,%rbx # ivp argument
659 mov %rcx,$key # key argument
660 mov 272(%rcx),${keyend}d # grandRounds
662 mov %r8,$_ivp
663 mov %rbp,$_rsp
665 .Lcbc_body:
666 lea .LCamellia_SBOX(%rip),$Tbl
668 mov \$32,%ecx
669 .align 4
670 .Lcbc_prefetch_sbox:
671 mov 0($Tbl),%rax
672 mov 32($Tbl),%rsi
673 mov 64($Tbl),%rdi
674 mov 96($Tbl),%r11
675 lea 128($Tbl),$Tbl
676 loop .Lcbc_prefetch_sbox
677 sub \$4096,$Tbl
678 shl \$6,$keyend
679 mov %rdx,%rcx # len argument
680 lea ($key,$keyend),$keyend
682 cmp \$0,%r9d # enc argument
683 je .LCBC_DECRYPT
685 and \$-16,%rdx
686 and \$15,%rcx # length residue
687 lea ($inp,%rdx),%rdx
688 mov $key,$_key
689 mov %rdx,$_end
690 mov %rcx,$_res
692 cmp $inp,%rdx
693 mov 0(%rbx),@S[0] # load IV
694 mov 4(%rbx),@S[1]
695 mov 8(%rbx),@S[2]
696 mov 12(%rbx),@S[3]
697 je .Lcbc_enc_tail
698 jmp .Lcbc_eloop
700 .align 16
701 .Lcbc_eloop:
702 xor 0($inp),@S[0]
703 xor 4($inp),@S[1]
704 xor 8($inp),@S[2]
705 bswap @S[0]
706 xor 12($inp),@S[3]
707 bswap @S[1]
708 bswap @S[2]
709 bswap @S[3]
711 call _x86_64_Camellia_encrypt
713 mov $_key,$key # "rewind" the key
714 bswap @S[0]
715 mov $_end,%rdx
716 bswap @S[1]
717 mov $_res,%rcx
718 bswap @S[2]
719 mov @S[0],0($out)
720 bswap @S[3]
721 mov @S[1],4($out)
722 mov @S[2],8($out)
723 lea 16($inp),$inp
724 mov @S[3],12($out)
725 cmp %rdx,$inp
726 lea 16($out),$out
727 jne .Lcbc_eloop
729 cmp \$0,%rcx
730 jne .Lcbc_enc_tail
732 mov $_ivp,$out
733 mov @S[0],0($out) # write out IV residue
734 mov @S[1],4($out)
735 mov @S[2],8($out)
736 mov @S[3],12($out)
737 jmp .Lcbc_done
739 .align 16
740 .Lcbc_enc_tail:
741 xor %rax,%rax
742 mov %rax,0+$ivec
743 mov %rax,8+$ivec
744 mov %rax,$_res
746 .Lcbc_enc_pushf:
747 pushfq
749 mov $inp,%rsi
750 lea 8+$ivec,%rdi
751 .long 0x9066A4F3 # rep movsb
752 popfq
753 .Lcbc_enc_popf:
755 lea $ivec,$inp
756 lea 16+$ivec,%rax
757 mov %rax,$_end
758 jmp .Lcbc_eloop # one more time
760 .align 16
761 .LCBC_DECRYPT:
762 xchg $key,$keyend
763 add \$15,%rdx
764 and \$15,%rcx # length residue
765 and \$-16,%rdx
766 mov $key,$_key
767 lea ($inp,%rdx),%rdx
768 mov %rdx,$_end
769 mov %rcx,$_res
771 mov (%rbx),%rax # load IV
772 mov 8(%rbx),%rbx
773 jmp .Lcbc_dloop
774 .align 16
775 .Lcbc_dloop:
776 mov 0($inp),@S[0]
777 mov 4($inp),@S[1]
778 mov 8($inp),@S[2]
779 bswap @S[0]
780 mov 12($inp),@S[3]
781 bswap @S[1]
782 mov %rax,0+$ivec # save IV to temporary storage
783 bswap @S[2]
784 mov %rbx,8+$ivec
785 bswap @S[3]
787 call _x86_64_Camellia_decrypt
789 mov $_key,$key # "rewind" the key
790 mov $_end,%rdx
791 mov $_res,%rcx
793 bswap @S[0]
794 mov ($inp),%rax # load IV for next iteration
795 bswap @S[1]
796 mov 8($inp),%rbx
797 bswap @S[2]
798 xor 0+$ivec,@S[0]
799 bswap @S[3]
800 xor 4+$ivec,@S[1]
801 xor 8+$ivec,@S[2]
802 lea 16($inp),$inp
803 xor 12+$ivec,@S[3]
804 cmp %rdx,$inp
805 je .Lcbc_ddone
807 mov @S[0],0($out)
808 mov @S[1],4($out)
809 mov @S[2],8($out)
810 mov @S[3],12($out)
812 lea 16($out),$out
813 jmp .Lcbc_dloop
815 .align 16
816 .Lcbc_ddone:
817 mov $_ivp,%rdx
818 cmp \$0,%rcx
819 jne .Lcbc_dec_tail
821 mov @S[0],0($out)
822 mov @S[1],4($out)
823 mov @S[2],8($out)
824 mov @S[3],12($out)
826 mov %rax,(%rdx) # write out IV residue
827 mov %rbx,8(%rdx)
828 jmp .Lcbc_done
829 .align 16
830 .Lcbc_dec_tail:
831 mov @S[0],0+$ivec
832 mov @S[1],4+$ivec
833 mov @S[2],8+$ivec
834 mov @S[3],12+$ivec
836 .Lcbc_dec_pushf:
837 pushfq
839 lea 8+$ivec,%rsi
840 lea ($out),%rdi
841 .long 0x9066A4F3 # rep movsb
842 popfq
843 .Lcbc_dec_popf:
845 mov %rax,(%rdx) # write out IV residue
846 mov %rbx,8(%rdx)
847 jmp .Lcbc_done
849 .align 16
850 .Lcbc_done:
851 mov $_rsp,%rcx
852 mov 0(%rcx),%r15
853 mov 8(%rcx),%r14
854 mov 16(%rcx),%r13
855 mov 24(%rcx),%r12
856 mov 32(%rcx),%rbp
857 mov 40(%rcx),%rbx
858 lea 48(%rcx),%rsp
859 .Lcbc_abort:
861 .size Camellia_cbc_encrypt,.-Camellia_cbc_encrypt
863 .asciz "Camellia for x86_64 by <appro\@openssl.org>"
867 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
868 # CONTEXT *context,DISPATCHER_CONTEXT *disp)
869 if ($win64) {
870 $rec="%rcx";
871 $frame="%rdx";
872 $context="%r8";
873 $disp="%r9";
875 $code.=<<___;
876 .extern __imp_RtlVirtualUnwind
877 .type common_se_handler,\@abi-omnipotent
878 .align 16
879 common_se_handler:
880 push %rsi
881 push %rdi
882 push %rbx
883 push %rbp
884 push %r12
885 push %r13
886 push %r14
887 push %r15
888 pushfq
889 lea -64(%rsp),%rsp
891 mov 120($context),%rax # pull context->Rax
892 mov 248($context),%rbx # pull context->Rip
894 mov 8($disp),%rsi # disp->ImageBase
895 mov 56($disp),%r11 # disp->HandlerData
897 mov 0(%r11),%r10d # HandlerData[0]
898 lea (%rsi,%r10),%r10 # prologue label
899 cmp %r10,%rbx # context->Rip<prologue label
900 jb .Lin_prologue
902 mov 152($context),%rax # pull context->Rsp
904 mov 4(%r11),%r10d # HandlerData[1]
905 lea (%rsi,%r10),%r10 # epilogue label
906 cmp %r10,%rbx # context->Rip>=epilogue label
907 jae .Lin_prologue
909 lea 40(%rax),%rax
910 mov -8(%rax),%rbx
911 mov -16(%rax),%rbp
912 mov -24(%rax),%r13
913 mov -32(%rax),%r14
914 mov -40(%rax),%r15
915 mov %rbx,144($context) # restore context->Rbx
916 mov %rbp,160($context) # restore context->Rbp
917 mov %r13,224($context) # restore context->R13
918 mov %r14,232($context) # restore context->R14
919 mov %r15,240($context) # restore context->R15
921 .Lin_prologue:
922 mov 8(%rax),%rdi
923 mov 16(%rax),%rsi
924 mov %rax,152($context) # restore context->Rsp
925 mov %rsi,168($context) # restore context->Rsi
926 mov %rdi,176($context) # restore context->Rdi
928 jmp .Lcommon_seh_exit
929 .size common_se_handler,.-common_se_handler
931 .type cbc_se_handler,\@abi-omnipotent
932 .align 16
933 cbc_se_handler:
934 push %rsi
935 push %rdi
936 push %rbx
937 push %rbp
938 push %r12
939 push %r13
940 push %r14
941 push %r15
942 pushfq
943 lea -64(%rsp),%rsp
945 mov 120($context),%rax # pull context->Rax
946 mov 248($context),%rbx # pull context->Rip
948 lea .Lcbc_prologue(%rip),%r10
949 cmp %r10,%rbx # context->Rip<.Lcbc_prologue
950 jb .Lin_cbc_prologue
952 lea .Lcbc_body(%rip),%r10
953 cmp %r10,%rbx # context->Rip<.Lcbc_body
954 jb .Lin_cbc_frame_setup
956 mov 152($context),%rax # pull context->Rsp
958 lea .Lcbc_abort(%rip),%r10
959 cmp %r10,%rbx # context->Rip>=.Lcbc_abort
960 jae .Lin_cbc_prologue
962 # handle pushf/popf in Camellia_cbc_encrypt
963 lea .Lcbc_enc_pushf(%rip),%r10
964 cmp %r10,%rbx # context->Rip<=.Lcbc_enc_pushf
965 jbe .Lin_cbc_no_flag
966 lea 8(%rax),%rax
967 lea .Lcbc_enc_popf(%rip),%r10
968 cmp %r10,%rbx # context->Rip<.Lcbc_enc_popf
969 jb .Lin_cbc_no_flag
970 lea -8(%rax),%rax
971 lea .Lcbc_dec_pushf(%rip),%r10
972 cmp %r10,%rbx # context->Rip<=.Lcbc_dec_pushf
973 jbe .Lin_cbc_no_flag
974 lea 8(%rax),%rax
975 lea .Lcbc_dec_popf(%rip),%r10
976 cmp %r10,%rbx # context->Rip<.Lcbc_dec_popf
977 jb .Lin_cbc_no_flag
978 lea -8(%rax),%rax
980 .Lin_cbc_no_flag:
981 mov 48(%rax),%rax # $_rsp
982 lea 48(%rax),%rax
984 .Lin_cbc_frame_setup:
985 mov -8(%rax),%rbx
986 mov -16(%rax),%rbp
987 mov -24(%rax),%r12
988 mov -32(%rax),%r13
989 mov -40(%rax),%r14
990 mov -48(%rax),%r15
991 mov %rbx,144($context) # restore context->Rbx
992 mov %rbp,160($context) # restore context->Rbp
993 mov %r12,216($context) # restore context->R12
994 mov %r13,224($context) # restore context->R13
995 mov %r14,232($context) # restore context->R14
996 mov %r15,240($context) # restore context->R15
998 .Lin_cbc_prologue:
999 mov 8(%rax),%rdi
1000 mov 16(%rax),%rsi
1001 mov %rax,152($context) # restore context->Rsp
1002 mov %rsi,168($context) # restore context->Rsi
1003 mov %rdi,176($context) # restore context->Rdi
1005 .align 4
1006 .Lcommon_seh_exit:
1008 mov 40($disp),%rdi # disp->ContextRecord
1009 mov $context,%rsi # context
1010 mov \$`1232/8`,%ecx # sizeof(CONTEXT)
1011 .long 0xa548f3fc # cld; rep movsq
1013 mov $disp,%rsi
1014 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
1015 mov 8(%rsi),%rdx # arg2, disp->ImageBase
1016 mov 0(%rsi),%r8 # arg3, disp->ControlPc
1017 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
1018 mov 40(%rsi),%r10 # disp->ContextRecord
1019 lea 56(%rsi),%r11 # &disp->HandlerData
1020 lea 24(%rsi),%r12 # &disp->EstablisherFrame
1021 mov %r10,32(%rsp) # arg5
1022 mov %r11,40(%rsp) # arg6
1023 mov %r12,48(%rsp) # arg7
1024 mov %rcx,56(%rsp) # arg8, (NULL)
1025 call *__imp_RtlVirtualUnwind(%rip)
1027 mov \$1,%eax # ExceptionContinueSearch
1028 lea 64(%rsp),%rsp
1029 popfq
1030 pop %r15
1031 pop %r14
1032 pop %r13
1033 pop %r12
1034 pop %rbp
1035 pop %rbx
1036 pop %rdi
1037 pop %rsi
1039 .size cbc_se_handler,.-cbc_se_handler
1041 .section .pdata
1042 .align 4
1043 .rva .LSEH_begin_Camellia_EncryptBlock_Rounds
1044 .rva .LSEH_end_Camellia_EncryptBlock_Rounds
1045 .rva .LSEH_info_Camellia_EncryptBlock_Rounds
1047 .rva .LSEH_begin_Camellia_DecryptBlock_Rounds
1048 .rva .LSEH_end_Camellia_DecryptBlock_Rounds
1049 .rva .LSEH_info_Camellia_DecryptBlock_Rounds
1051 .rva .LSEH_begin_Camellia_Ekeygen
1052 .rva .LSEH_end_Camellia_Ekeygen
1053 .rva .LSEH_info_Camellia_Ekeygen
1055 .rva .LSEH_begin_Camellia_cbc_encrypt
1056 .rva .LSEH_end_Camellia_cbc_encrypt
1057 .rva .LSEH_info_Camellia_cbc_encrypt
1059 .section .xdata
1060 .align 8
1061 .LSEH_info_Camellia_EncryptBlock_Rounds:
1062 .byte 9,0,0,0
1063 .rva common_se_handler
1064 .rva .Lenc_prologue,.Lenc_epilogue # HandlerData[]
1065 .LSEH_info_Camellia_DecryptBlock_Rounds:
1066 .byte 9,0,0,0
1067 .rva common_se_handler
1068 .rva .Ldec_prologue,.Ldec_epilogue # HandlerData[]
1069 .LSEH_info_Camellia_Ekeygen:
1070 .byte 9,0,0,0
1071 .rva common_se_handler
1072 .rva .Lkey_prologue,.Lkey_epilogue # HandlerData[]
1073 .LSEH_info_Camellia_cbc_encrypt:
1074 .byte 9,0,0,0
1075 .rva cbc_se_handler
# Replace every `expr` span in the accumulated assembly with the result of
# evaluating expr as Perl (register-name helpers, offset arithmetic), then
# write the text to STDOUT, which is the pipe into the translator.
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
# close() on a pipe waits for the child; a false return means the write
# failed or the translator exited with a non-zero status, so do not let a
# broken translation pass silently.
close STDOUT or die "error closing STDOUT: $!";