3 # ====================================================================
4 # Copyright (c) 2008 Andy Polyakov <appro@openssl.org>
6 # This module may be used under the terms of either the GNU General
7 # Public License version 2 or later, the GNU Lesser General Public
8 # License version 2.1 or later, the Mozilla Public License version
9 # 1.1 or the BSD License. The exact terms of either license are
10 # distributed along with this module. For further details see
11 # http://www.openssl.org/~appro/camellia/.
12 # ====================================================================
14 # Performance in cycles per processed byte (less is better) in
15 # 'openssl speed ...' benchmark:
18 # -evp camellia-128-ecb 16.7 21.0 22.7
19 # + over gcc 3.4.6 +25% +5% 0%
21 # camellia-128-cbc 15.7 20.4 21.1
23 # 128-bit key setup 128 216 205 cycles/key
24 # + over gcc 3.4.6 +54% +39% +15%
26 # Numbers in "+" rows represent performance improvement over compiler
27 # generated code. Key setup timings are impressive on AMD and Core2
28 # thanks to 64-bit operations being covertly deployed. Improvement on
29 # EM64T, pre-Core2 Intel x86_64 CPU, is not as impressive, because it
30 # apparently emulates some of 64-bit operations in [32-bit] microcode.
# If the first argument contains a dot it is taken to be the output file
# name rather than a flavour: move it to $output and leave $flavour unset.
34 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
# Capture the directory part of this script's own path ($0) in $dir.
36 $0 =~ m/(.*[\/\\])[^\
/\\]+$/; $dir=$1;
# Locate the x86_64-xlate.pl translator: first next to this script, then in
# ../../perlasm/ relative to it.  Abort if neither candidate is a plain file.
37 ( $xlate="${dir}x86_64-xlate.pl" and -f
$xlate ) or
38 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f
$xlate) or
39 die "can't locate x86_64-xlate.pl";
# Pipe everything written to OUT through the x86_64-xlate.pl translator, run
# with the same perl binary ($^X) that is executing this script.  The
# translator path is quoted so that a directory containing spaces does not
# split the command line, and a failure to start the pipe is fatal instead of
# being silently ignored.
open OUT,"| \"$^X\" \"$xlate\" $flavour $output"
    or die "can't call $xlate: $!";
# hi(REG) - return the name of the high-byte sub-register of REG.
#
# REG is an AT&T-style register operand such as "%eax" or "%rbx".  The
# substitution only matches the a/b/c/d registers, yielding "%ah", "%bh",
# "%ch" or "%dh"; any other operand (for example "%r8d") is returned
# unchanged.
#
# No prototype is declared: the sub takes one argument, and an empty
# prototype would reject every call not written with a leading '&'.
sub hi {
    my ($reg) = @_;
    $reg =~ s/%[er]([a-d])x/%${1}h/;
    return $reg;
}
# lo(REG) - return the name of the low-byte sub-register of REG.
#
# REG is an AT&T-style register operand.  Three rewrites are tried in turn:
#   %eax / %rax  (a-d registers)  ->  %al
#   %esi / %rdi  (si/di)          ->  %sil / %dil
#   %r8  / %r8d  (numbered regs)  ->  %r8b
# An operand matching none of them is returned unchanged.
#
# No prototype is declared: the sub takes one argument, and an empty
# prototype would reject every call not written with a leading '&'.
sub lo {
    my ($reg) = @_;
    $reg =~ s/%[er]([a-d])x/%${1}l/;
    $reg =~ s/%[er]([sd]i)/%${1}l/;
    $reg =~ s/%(r[0-9]+)[d]?/%${1}b/;
    return $reg;
}
# 32-bit scratch registers used as temporaries by the generated code.
49 $t0="%eax";$t1="%ebx";$t2="%ecx";$t3="%edx";
# Registers that hold the four 32-bit words of the cipher state.
50 @S=("%r8d","%r9d","%r10d","%r11d");
# Base register for S-box lookups; it is loaded with the address of
# .LCamellia_SBOX before the tables are used.
53 $Tbl="%rbp"; # size optimization
60 # const unsigned int Camellia_SBOX[4][256];
61 # Well, sort of... Camellia_SBOX[0][] is interleaved with [1][],
62 # and [2][] - with [3][]. This is done to minimize code size.
# The values below are byte displacements from $Tbl.  Lookups use an index
# scale of 8, so each index selects a pair of adjacent 32-bit entries: the
# first interleaved pair table occupies 256*8 = 2048 bytes (displacements 0
# and 4), and the second pair table follows it (displacements 2048 and 2052).
63 $SBOX1_1110=0; # Camellia_SBOX[0]
64 $SBOX4_4404=4; # Camellia_SBOX[1]
65 $SBOX2_0222=2048; # Camellia_SBOX[2]
66 $SBOX3_3033=2052; # Camellia_SBOX[3]
68 sub Camellia_Feistel
{
70 my $seed=defined(@_[1])?
@_[1]:0;
71 my $scale=$seed<0?
-8:8;
73 my $s0=@S[($j)%4],$s1=@S[($j+1)%4],$s2=@S[($j+2)%4],$s3=@S[($j+3)%4];
76 xor $s0,$t0 # t0^=key[0]
77 xor $s1,$t1 # t1^=key[1]
78 movz
`&hi("$t0")`,$i0 # (t0>>8)&0xff
79 movz
`&lo("$t1")`,$i1 # (t1>>0)&0xff
80 mov
$SBOX3_3033($Tbl,$i0,8),$t3 # t3=SBOX3_3033[0]
81 mov
$SBOX1_1110($Tbl,$i1,8),$t2 # t2=SBOX1_1110[1]
82 movz
`&lo("$t0")`,$i0 # (t0>>0)&0xff
84 movz
`&hi("$t1")`,$i1 # (t1>>8)&0xff
85 xor $SBOX4_4404($Tbl,$i0,8),$t3 # t3^=SBOX4_4404[0]
87 xor $SBOX4_4404($Tbl,$i1,8),$t2 # t2^=SBOX4_4404[1]
88 movz
`&hi("$t0")`,$i0 # (t0>>24)&0xff
89 movz
`&lo("$t1")`,$i1 # (t1>>16)&0xff
90 xor $SBOX1_1110($Tbl,$i0,8),$t3 # t3^=SBOX1_1110[0]
91 xor $SBOX3_3033($Tbl,$i1,8),$t2 # t2^=SBOX3_3033[1]
92 movz
`&lo("$t0")`,$i0 # (t0>>16)&0xff
93 movz
`&hi("$t1")`,$i1 # (t1>>24)&0xff
94 xor $SBOX2_0222($Tbl,$i0,8),$t3 # t3^=SBOX2_0222[0]
95 xor $SBOX2_0222($Tbl,$i1,8),$t2 # t2^=SBOX2_0222[1]
96 mov
`$seed+($i+1)*$scale`($key),$t1 # prefetch key[i+1]
97 mov
`$seed+($i+1)*$scale+4`($key),$t0
99 ror \
$8,$t3 # t3=RightRotate(t3,8)
106 # void Camellia_EncryptBlock_Rounds(
108 # const Byte plaintext[],
109 # const KEY_TABLE_TYPE keyTable,
115 .globl Camellia_EncryptBlock
116 .type Camellia_EncryptBlock
,\
@abi-omnipotent
118 Camellia_EncryptBlock
:
122 adcl \
$0,$arg0d # keyBitLength==128?3:4
124 .size Camellia_EncryptBlock
,.-Camellia_EncryptBlock
126 .globl Camellia_EncryptBlock_Rounds
127 .type Camellia_EncryptBlock_Rounds
,\
@function,4
130 Camellia_EncryptBlock_Rounds
:
138 #mov %rsi,$inp # put away arguments
142 shl \
$6,%edi # process grandRounds
143 lea
.LCamellia_SBOX
(%rip),$Tbl
144 lea
($key,%rdi),$keyend
146 mov
0(%rsi),@S[0] # load plaintext
155 call _x86_64_Camellia_encrypt
174 .size Camellia_EncryptBlock_Rounds
,.-Camellia_EncryptBlock_Rounds
176 .type _x86_64_Camellia_encrypt
,\
@abi-omnipotent
178 _x86_64_Camellia_encrypt
:
180 xor 4($key),@S[0] # ^=key[0-3]
185 mov
16($key),$t1 # prefetch key[4-5]
189 for ($i=0;$i<6;$i++) { Camellia_Feistel
($i,16); }
193 mov
8($key),$t3 # prefetch key[2-3]
200 xor $t3,@S[2] # s2^=s3|key[3];
201 xor $t0,@S[1] # s1^=LeftRotate(s0&key[0],1);
205 xor $t1,@S[0] # s0^=s1|key[1];
206 xor $t2,@S[3] # s3^=LeftRotate(s2&key[2],1);
211 xor @S[2],$t0 # SwapHalf
221 .byte
0xf3,0xc3 # rep ret
222 .size _x86_64_Camellia_encrypt
,.-_x86_64_Camellia_encrypt
225 .globl Camellia_DecryptBlock
226 .type Camellia_DecryptBlock
,\
@abi-omnipotent
228 Camellia_DecryptBlock
:
232 adcl \
$0,$arg0d # keyBitLength==128?3:4
234 .size Camellia_DecryptBlock
,.-Camellia_DecryptBlock
236 .globl Camellia_DecryptBlock_Rounds
237 .type Camellia_DecryptBlock_Rounds
,\
@function,4
240 Camellia_DecryptBlock_Rounds
:
248 #mov %rsi,$inp # put away arguments
252 shl \
$6,%edi # process grandRounds
253 lea
.LCamellia_SBOX
(%rip),$Tbl
254 lea
($keyend,%rdi),$key
256 mov
0(%rsi),@S[0] # load plaintext
265 call _x86_64_Camellia_decrypt
284 .size Camellia_DecryptBlock_Rounds
,.-Camellia_DecryptBlock_Rounds
286 .type _x86_64_Camellia_decrypt
,\
@abi-omnipotent
288 _x86_64_Camellia_decrypt
:
290 xor 4($key),@S[0] # ^=key[0-3]
295 mov
-8($key),$t1 # prefetch key[4-5]
299 for ($i=0;$i<6;$i++) { Camellia_Feistel
($i,-8); }
303 mov
0($key),$t3 # prefetch key[2-3]
310 xor $t3,@S[2] # s2^=s3|key[3];
311 xor $t0,@S[1] # s1^=LeftRotate(s0&key[0],1);
315 xor $t1,@S[0] # s0^=s1|key[1];
316 xor $t2,@S[3] # s3^=LeftRotate(s2&key[2],1);
327 mov
$t2,@S[0] # SwapHalf
332 .byte
0xf3,0xc3 # rep ret
333 .size _x86_64_Camellia_decrypt
,.-_x86_64_Camellia_decrypt
337 my ($rnd,$key,@T)=@_;
338 my $bias=int(@T[0])?
shift(@T):0;
342 mov
@T[1],`$bias+$rnd*8+0`($key)
343 mov
@T[0],`$bias+$rnd*8+4`($key)
344 mov
@T[3],`$bias+$rnd*8+8`($key)
345 mov
@T[2],`$bias+$rnd*8+12`($key)
348 $code.=" mov @T[0],`$bias+$rnd*8+0`($key)\n";
349 $code.=" mov @T[1],`$bias+$rnd*8+8`($key)\n" if ($#T>=1);
354 my ($rnd,$key,@T)=@_;
355 my $bias=int(@T[0])?
shift(@T):0;
357 $code.=" mov `$bias+$rnd*8+0`($key),@T[0]\n";
358 $code.=" mov `$bias+$rnd*8+8`($key),@T[1]\n" if ($#T>=1);
361 # shld is very slow on Intel EM64T family. Even on AMD it limits
362 # instruction decode rate [because it's VectorPath] and consequently
365 my ($i0,$i1,$rot)=@_;
376 # ... Implementing 128-bit rotate without shld gives 80% better
377 # performance on EM64T, +15% on AMD64 and only ~7% degradation on
378 # Core2. This is therefore preferred.
380 my ($i0,$i1,$rot)=@_;
399 .globl Camellia_Ekeygen
400 .type Camellia_Ekeygen
,\
@function,3
410 mov
%rdi,$keyend # put away arguments, keyBitLength
411 mov
%rdx,$out # keyTable
413 mov
0(%rsi),@S[0] # load 0-127 bits
423 &_saveround
(0,$out,@S); # KL<<<0
425 cmp \
$128,$keyend # check keyBitLength
428 mov
16(%rsi),@S[0] # load 128-191 bits
432 mov
24(%rsi),@S[2] # load 192-255 bits
446 &_saveround
(4,$out,@S); # temp storage for KR!
448 xor 0($out),@S[1] # KR^KL
454 lea
.LCamellia_SIGMA
(%rip),$key
455 lea
.LCamellia_SBOX
(%rip),$Tbl
460 &Camellia_Feistel
($step++);
461 &Camellia_Feistel
($step++);
463 xor 0($out),@S[1] # ^KL
468 &Camellia_Feistel
($step++);
469 &Camellia_Feistel
($step++);
474 lea
128($out),$out # size optimization
475 shl \
$32,%r8 # @S[0]||
476 shl \
$32,%r10 # @S[2]||
478 or %r11,%r10 # ||@S[3]
# Emit the subkey stores for the KL/KA-only schedule (presumably the 128-bit
# key path - the round slots stop at 24).  KL is reloaded into %rax:%rbx and
# KA sits in %r8:%r10.  Each _rotl128 call is given a register pair and a
# rotation amount; the trailing comments track the accumulated rotation of
# that pair.  Each _saveround call stores the listed register(s) into 8-byte
# slot <rnd> of the key table; when only one register is passed, only that
# single 8-byte half is stored.  The -128 bias compensates for $out having
# been advanced by 128 in the assembly emitted just before this point.
480 &_loadround
(0,$out,-128,"%rax","%rbx"); # KL
481 &_saveround
(2,$out,-128,"%r8","%r10"); # KA<<<0
482 &_rotl128
("%rax","%rbx",15);
483 &_saveround
(4,$out,-128,"%rax","%rbx"); # KL<<<15
484 &_rotl128
("%r8","%r10",15);
485 &_saveround
(6,$out,-128,"%r8","%r10"); # KA<<<15
486 &_rotl128
("%r8","%r10",15); # 15+15=30
487 &_saveround
(8,$out,-128,"%r8","%r10"); # KA<<<30
488 &_rotl128
("%rax","%rbx",30); # 15+30=45
489 &_saveround
(10,$out,-128,"%rax","%rbx"); # KL<<<45
490 &_rotl128
("%r8","%r10",15); # 30+15=45
491 &_saveround
(12,$out,-128,"%r8"); # KA<<<45
492 &_rotl128
("%rax","%rbx",15); # 45+15=60
493 &_saveround
(13,$out,-128,"%rbx"); # KL<<<60
494 &_rotl128
("%r8","%r10",15); # 45+15=60
495 &_saveround
(14,$out,-128,"%r8","%r10"); # KA<<<60
496 &_rotl128
("%rax","%rbx",17); # 60+17=77
497 &_saveround
(16,$out,-128,"%rax","%rbx"); # KL<<<77
498 &_rotl128
("%rax","%rbx",17); # 77+17=94
499 &_saveround
(18,$out,-128,"%rax","%rbx"); # KL<<<94
500 &_rotl128
("%r8","%r10",34); # 60+34=94
501 &_saveround
(20,$out,-128,"%r8","%r10"); # KA<<<94
502 &_rotl128
("%rax","%rbx",17); # 94+17=111
503 &_saveround
(22,$out,-128,"%rax","%rbx"); # KL<<<111
504 &_rotl128
("%r8","%r10",17); # 94+17=111
505 &_saveround
(24,$out,-128,"%r8","%r10"); # KA<<<111
512 &_saveround
(6,$out,@S); # temp storage for KA!
514 xor `4*8+0`($out),@S[1] # KA^KR
515 xor `4*8+4`($out),@S[0]
516 xor `5*8+0`($out),@S[3]
517 xor `5*8+4`($out),@S[2]
519 &Camellia_Feistel
($step++);
520 &Camellia_Feistel
($step++);
522 &_loadround
(0,$out,"%rax","%rbx"); # KL
523 &_loadround
(4,$out,"%rcx","%rdx"); # KR
524 &_loadround
(6,$out,"%r14","%r15"); # KA
526 lea
128($out),$out # size optimization
527 shl \
$32,%r8 # @S[0]||
528 shl \
$32,%r10 # @S[2]||
530 or %r11,%r10 # ||@S[3]
# Emit the subkey stores for the schedule that also uses KR and KB
# (presumably the 192/256-bit key path).  Register pairs at this point:
# KL in %rax:%rbx, KR in %rcx:%rdx, KA in %r14:%r15, KB in %r8:%r10.  Each
# _rotl128 call is given a pair and a rotation amount; the trailing comments
# track the accumulated rotation of that pair.  Each _saveround call stores
# the pair into 8-byte slot <rnd> of the key table; the -128 bias compensates
# for $out having been advanced by 128 in the assembly emitted just before
# this point.
532 &_saveround
(2,$out,-128,"%r8","%r10"); # KB<<<0
533 &_rotl128
("%rcx","%rdx",15);
534 &_saveround
(4,$out,-128,"%rcx","%rdx"); # KR<<<15
535 &_rotl128
("%r14","%r15",15);
536 &_saveround
(6,$out,-128,"%r14","%r15"); # KA<<<15
537 &_rotl128
("%rcx","%rdx",15); # 15+15=30
538 &_saveround
(8,$out,-128,"%rcx","%rdx"); # KR<<<30
539 &_rotl128
("%r8","%r10",30);
540 &_saveround
(10,$out,-128,"%r8","%r10"); # KB<<<30
541 &_rotl128
("%rax","%rbx",45);
542 &_saveround
(12,$out,-128,"%rax","%rbx"); # KL<<<45
543 &_rotl128
("%r14","%r15",30); # 15+30=45
544 &_saveround
(14,$out,-128,"%r14","%r15"); # KA<<<45
545 &_rotl128
("%rax","%rbx",15); # 45+15=60
546 &_saveround
(16,$out,-128,"%rax","%rbx"); # KL<<<60
547 &_rotl128
("%rcx","%rdx",30); # 30+30=60
548 &_saveround
(18,$out,-128,"%rcx","%rdx"); # KR<<<60
549 &_rotl128
("%r8","%r10",30); # 30+30=60
550 &_saveround
(20,$out,-128,"%r8","%r10"); # KB<<<60
551 &_rotl128
("%rax","%rbx",17); # 60+17=77
552 &_saveround
(22,$out,-128,"%rax","%rbx"); # KL<<<77
553 &_rotl128
("%r14","%r15",32); # 45+32=77
554 &_saveround
(24,$out,-128,"%r14","%r15"); # KA<<<77
555 &_rotl128
("%rcx","%rdx",34); # 60+34=94
556 &_saveround
(26,$out,-128,"%rcx","%rdx"); # KR<<<94
557 &_rotl128
("%r14","%r15",17); # 77+17=94
558 &_saveround
(28,$out,-128,"%r14","%r15"); # KA<<<94
559 &_rotl128
("%rax","%rbx",34); # 77+34=111
560 &_saveround
(30,$out,-128,"%rax","%rbx"); # KL<<<111
561 &_rotl128
("%r8","%r10",51); # 60+51=111
562 &_saveround
(32,$out,-128,"%r8","%r10"); # KB<<<111
574 .size Camellia_Ekeygen
,.-Camellia_Ekeygen
579 112,130, 44,236,179, 39,192,229,228,133, 87, 53,234, 12,174, 65,
580 35,239,107,147, 69, 25,165, 33,237, 14, 79, 78, 29,101,146,189,
581 134,184,175,143,124,235, 31,206, 62, 48,220, 95, 94,197, 11, 26,
582 166,225, 57,202,213, 71, 93, 61,217, 1, 90,214, 81, 86,108, 77,
583 139, 13,154,102,251,204,176, 45,116, 18, 43, 32,240,177,132,153,
584 223, 76,203,194, 52,126,118, 5,109,183,169, 49,209, 23, 4,215,
585 20, 88, 58, 97,222, 27, 17, 28, 50, 15,156, 22, 83, 24,242, 34,
586 254, 68,207,178,195,181,122,145, 36, 8,232,168, 96,252,105, 80,
587 170,208,160,125,161,137, 98,151, 84, 91, 30,149,224,255,100,210,
588 16,196, 0, 72,163,247,117,219,138, 3,230,218, 9, 63,221,148,
589 135, 92,131, 2,205, 74,144, 51,115,103,246,243,157,127,191,226,
590 82,155,216, 38,200, 55,198, 59,129,150,111, 75, 19,190, 99, 46,
591 233,121,167,140,159,110,188,142, 41,245,249,182, 47,253,180, 89,
592 120,152, 6,106,231, 70,113,186,212, 37,171, 66,136,162,141,250,
593 114, 7,185, 85,248,238,172, 10, 54, 73, 42,104, 60, 56,241,164,
594 64, 40,211,123,187,201, 67,193, 21,227,173,244,119,199,128,158);
# S1110(I) - format entry I of the SBOX1_1110 table.
#
# Looks up s = $SBOX[I] and returns the 32-bit word whose bytes, from most
# to least significant, are (s, s, s, 0), formatted as "0x%08x".
# Reads the file-level @SBOX array.
sub S1110 {
    my ($idx) = @_;
    my $s = $SBOX[$idx];
    return sprintf("0x%08x", $s << 24 | $s << 16 | $s << 8);
}
# S4404(I) - format entry I of the SBOX4_4404 table.
#
# The index (not the value) is rotated left by one bit within 8 bits before
# the lookup: s = $SBOX[rotl8(I,1)].  Returns the 32-bit word whose bytes,
# from most to least significant, are (s, s, 0, s), formatted as "0x%08x".
# Reads the file-level @SBOX array.
sub S4404 {
    my ($idx) = @_;
    my $rotated_idx = ($idx << 1 | $idx >> 7) & 0xff;
    my $s = $SBOX[$rotated_idx];
    return sprintf("0x%08x", $s << 24 | $s << 16 | $s);
}
# S0222(I) - format entry I of the SBOX2_0222 table.
#
# Looks up $SBOX[I], rotates that byte left by one bit within 8 bits to get
# s, and returns the 32-bit word whose bytes, from most to least
# significant, are (0, s, s, s), formatted as "0x%08x".
# Reads the file-level @SBOX array.
sub S0222 {
    my ($idx) = @_;
    my $s = $SBOX[$idx];
    $s = ($s << 1 | $s >> 7) & 0xff;
    return sprintf("0x%08x", $s << 16 | $s << 8 | $s);
}
# S3033(I) - format entry I of the SBOX3_3033 table.
#
# Looks up $SBOX[I], rotates that byte right by one bit within 8 bits to get
# s, and returns the 32-bit word whose bytes, from most to least
# significant, are (s, 0, s, s), formatted as "0x%08x".
# Reads the file-level @SBOX array.
sub S3033 {
    my ($idx) = @_;
    my $s = $SBOX[$idx];
    $s = ($s >> 1 | $s << 7) & 0xff;
    return sprintf("0x%08x", $s << 24 | $s << 8 | $s);
}
604 .long
0x3bcc908b, 0xa09e667f, 0x4caa73b2, 0xb67ae858
605 .long
0xe94f82be, 0xc6ef372f, 0xf1d36f1c, 0x54ff53a5
606 .long
0xde682d1d, 0x10e527fa, 0xb3e6c1fd, 0xb05688c2
610 # tables are interleaved, remember?
# data_word(LIST) - append one ".long" directive to the global $code buffer.
# The arguments are joined with commas and used verbatim as the operands.
sub data_word {
    my $operands = join(',', @_);
    $code .= ".long\t" . $operands . "\n";
}
# Emit the first interleaved table: 256 .long pairs of
# (SBOX1_1110[i], SBOX4_4404[i]).
$i = 0;
while ($i < 256) {
    data_word(S1110($i), S4404($i));
    $i++;
}
# Emit the second interleaved table: 256 .long pairs of
# (SBOX2_0222[i], SBOX3_3033[i]).
$i = 0;
while ($i < 256) {
    data_word(S0222($i), S3033($i));
    $i++;
}
615 # void Camellia_cbc_encrypt (const unsigned char *inp, unsigned char *out,
616 # size_t length, const CAMELLIA_KEY *key,
617 # unsigned char *ivp,const int enc);
620 $_end="8(%rsp)"; # inp+len&~15
621 $_res="16(%rsp)"; # len&15
627 .globl Camellia_cbc_encrypt
628 .type Camellia_cbc_encrypt
,\
@function,6
630 Camellia_cbc_encrypt
:
645 # place stack frame just "above mod 1024" the key schedule,
646 # this ensures that cache associativity suffices
647 lea
-64-63(%rcx),%r10
652 #add \$8,%rsp # 8 is reserved for callee's ra
654 mov
%rdi,$inp # inp argument
655 mov
%rsi,$out # out argument
656 mov
%r8,%rbx # ivp argument
657 mov
%rcx,$key # key argument
658 mov
272(%rcx),${keyend
}d
# grandRounds
664 lea
.LCamellia_SBOX
(%rip),$Tbl
674 loop .Lcbc_prefetch_sbox
677 mov
%rdx,%rcx # len argument
678 lea
($key,$keyend),$keyend
680 cmp \
$0,%r9d # enc argument
684 and \
$15,%rcx # length residue
691 mov
0(%rbx),@S[0] # load IV
709 call _x86_64_Camellia_encrypt
711 mov
$_key,$key # "rewind" the key
731 mov
@S[0],0($out) # write out IV residue
749 .long
0x9066A4F3 # rep movsb
756 jmp
.Lcbc_eloop
# one more time
762 and \
$15,%rcx # length residue
769 mov
(%rbx),%rax # load IV
780 mov
%rax,0+$ivec # save IV to temporary storage
785 call _x86_64_Camellia_decrypt
787 mov
$_key,$key # "rewind" the key
792 mov
($inp),%rax # load IV for next iteration
824 mov
%rax,(%rdx) # write out IV residue
839 .long
0x9066A4F3 # rep movsb
843 mov
%rax,(%rdx) # write out IV residue
859 .size Camellia_cbc_encrypt
,.-Camellia_cbc_encrypt
861 .asciz
"Camellia for x86_64 by <appro\@openssl.org>"
# Post-process the generated text: every `...` span in $code is replaced by
# the result of evaluating its contents as a Perl expression (this is what
# resolves the arithmetic written between backticks in the assembly above).
865 $code =~ s/\`([^\`]*)\`/eval $1/gem;