import libcrypto (LibreSSL 2.5.2)
[unleashed.git] / lib / libcrypto / camellia / asm / cmll-x86_64.pl
bloba171c654b2dc68c5202156927caa14f282ab8153
1 #!/usr/bin/env perl
3 # ====================================================================
4 # Copyright (c) 2008 Andy Polyakov <appro@openssl.org>
6 # This module may be used under the terms of either the GNU General
7 # Public License version 2 or later, the GNU Lesser General Public
8 # License version 2.1 or later, the Mozilla Public License version
9 # 1.1 or the BSD License. The exact terms of either license are
10 # distributed along with this module. For further details see
11 # http://www.openssl.org/~appro/camellia/.
12 # ====================================================================
14 # Performance in cycles per processed byte (less is better) in
15 # 'openssl speed ...' benchmark:
17 # AMD64 Core2 EM64T
18 # -evp camellia-128-ecb 16.7 21.0 22.7
19 # + over gcc 3.4.6 +25% +5% 0%
21 # camellia-128-cbc 15.7 20.4 21.1
23 # 128-bit key setup 128 216 205 cycles/key
24 # + over gcc 3.4.6 +54% +39% +15%
26 # Numbers in "+" rows represent performance improvement over compiler
27 # generated code. Key setup timings are impressive on AMD and Core2
28 # thanks to 64-bit operations being covertly deployed. Improvement on
29 # EM64T, pre-Core2 Intel x86_64 CPU, is not as impressive, because it
30 # apparently emulates some of 64-bit operations in [32-bit] microcode.
# Command line: [flavour] output.  When the first argument contains a dot
# it is taken to be the output file name and the flavour is left undefined.
$flavour = shift;
$output = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

# Locate the perlasm translator: first in this script's own directory,
# then in ../../perlasm/ relative to it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Everything printed to STDOUT from here on is piped through the translator.
# A failed pipe open has to be fatal: otherwise STDOUT would be aliased to
# an unopened handle and the generated code would be silently lost.
open OUT,"| \"$^X\" $xlate $flavour $output" or
    die "can't call $xlate: $!";
*STDOUT=*OUT;
# hi(reg): map %eax/%rax .. %edx/%rdx to the matching high-byte register
# (%ah .. %dh).  Any other register name is returned unchanged.
# No prototype: the sub takes one argument, so an empty "()" prototype
# would reject every call not written with a leading "&".
sub hi { my $r=shift; $r =~ s/%[er]([a-d])x/%${1}h/; $r; }
# lo(reg): map a 32/64-bit register name to its low-byte form:
#   %eax/%rax .. %edx/%rdx -> %al .. %dl
#   %esi/%rsi, %edi/%rdi   -> %sil, %dil
#   %rN / %rNd             -> %rNb
# Names matching none of the patterns are returned unchanged.
# No prototype: the sub takes one argument, so an empty "()" prototype
# would reject every call not written with a leading "&".
sub lo { my $r=shift;
    $r =~ s/%[er]([a-d])x/%${1}l/;
    $r =~ s/%[er]([sd]i)/%${1}l/;
    $r =~ s/%(r[0-9]+)[d]?/%${1}b/;
    $r; }
# Register allocation shared by the code templates in this file.
($t0,$t1,$t2,$t3)=("%eax","%ebx","%ecx","%edx");	# temporaries / round key words
@S=qw(%r8d %r9d %r10d %r11d);			# the four 32-bit state words
($i0,$i1)=("%esi","%edi");			# table indices
$Tbl="%rbp";	# size optimization
($inp,$out,$key,$keyend)=("%r12","%r13","%r14","%r15");
$arg0d="%edi";					# same register as $i1

# const unsigned int Camellia_SBOX[4][256];
# Well, sort of... Camellia_SBOX[0][] is interleaved with [1][],
# and [2][] - with [3][]. This is done to minimize code size.
# Each index therefore covers 8 bytes (two 32-bit entries), and the second
# interleaved pair starts 256*8 = 2048 bytes into the table.
($SBOX1_1110,$SBOX4_4404)=(0,4);	# Camellia_SBOX[0], Camellia_SBOX[1]
($SBOX2_0222,$SBOX3_3033)=(2048,2052);	# Camellia_SBOX[2], Camellia_SBOX[3]
# Camellia_Feistel(i[,seed])
#
# Appends the code for one Feistel round to $code.
#
#   i     round index.  Its parity selects which pair of state words in @S
#         is mixed with the round key and which pair receives the result.
#         It also determines the offset of the next round key.
#   seed  byte offset (relative to $key) of the first round key of the
#         sequence; defaults to 0.  A negative seed makes the key offsets
#         step downwards (-8 per round) instead of upwards (+8).
#
# The emitted code expects the current round key in $t0/$t1 and loads the
# key for round i+1 into them before it finishes.  The `...` placeholders
# are resolved by the eval pass at the end of the script.
sub Camellia_Feistel {
my ($i,$seed)=@_;
$seed=0 unless defined($seed);
my $scale=$seed<0?-8:8;
my $j=($i&1)*2;
# All four names must be lexical; a bare "my $s0=...,$s1=..." would declare
# only $s0 and turn the remaining three into package globals.
my ($s0,$s1,$s2,$s3)=@S[($j)%4,($j+1)%4,($j+2)%4,($j+3)%4];

$code.=<<___;
	xor $s0,$t0 # t0^=key[0]
	xor $s1,$t1 # t1^=key[1]
	movz `&hi("$t0")`,$i0 # (t0>>8)&0xff
	movz `&lo("$t1")`,$i1 # (t1>>0)&0xff
	mov $SBOX3_3033($Tbl,$i0,8),$t3 # t3=SBOX3_3033[0]
	mov $SBOX1_1110($Tbl,$i1,8),$t2 # t2=SBOX1_1110[1]
	movz `&lo("$t0")`,$i0 # (t0>>0)&0xff
	shr \$16,$t0
	movz `&hi("$t1")`,$i1 # (t1>>8)&0xff
	xor $SBOX4_4404($Tbl,$i0,8),$t3 # t3^=SBOX4_4404[0]
	shr \$16,$t1
	xor $SBOX4_4404($Tbl,$i1,8),$t2 # t2^=SBOX4_4404[1]
	movz `&hi("$t0")`,$i0 # (t0>>24)&0xff
	movz `&lo("$t1")`,$i1 # (t1>>16)&0xff
	xor $SBOX1_1110($Tbl,$i0,8),$t3 # t3^=SBOX1_1110[0]
	xor $SBOX3_3033($Tbl,$i1,8),$t2 # t2^=SBOX3_3033[1]
	movz `&lo("$t0")`,$i0 # (t0>>16)&0xff
	movz `&hi("$t1")`,$i1 # (t1>>24)&0xff
	xor $SBOX2_0222($Tbl,$i0,8),$t3 # t3^=SBOX2_0222[0]
	xor $SBOX2_0222($Tbl,$i1,8),$t2 # t2^=SBOX2_0222[1]
	mov `$seed+($i+1)*$scale`($key),$t1 # prefetch key[i+1]
	mov `$seed+($i+1)*$scale+4`($key),$t0
	xor $t3,$t2 # t2^=t3
	ror \$8,$t3 # t3=RightRotate(t3,8)
	xor $t2,$s2
	xor $t2,$s3
	xor $t3,$s3
___
}
106 # void Camellia_EncryptBlock_Rounds(
107 # int grandRounds,
108 # const Byte plaintext[],
109 # const KEY_TABLE_TYPE keyTable,
110 # Byte ciphertext[])
111 $code=<<___;
112 .text
114 # V1.x API
115 .globl Camellia_EncryptBlock
116 .type Camellia_EncryptBlock,\@abi-omnipotent
117 .align 16
118 Camellia_EncryptBlock:
119 movl \$128,%eax
120 subl $arg0d,%eax
121 movl \$3,$arg0d
122 adcl \$0,$arg0d # keyBitLength==128?3:4
123 jmp .Lenc_rounds
124 .size Camellia_EncryptBlock,.-Camellia_EncryptBlock
125 # V2
126 .globl Camellia_EncryptBlock_Rounds
127 .type Camellia_EncryptBlock_Rounds,\@function,4
128 .align 16
129 .Lenc_rounds:
130 Camellia_EncryptBlock_Rounds:
131 push %rbx
132 push %rbp
133 push %r13
134 push %r14
135 push %r15
136 .Lenc_prologue:
138 #mov %rsi,$inp # put away arguments
139 mov %rcx,$out
140 mov %rdx,$key
142 shl \$6,%edi # process grandRounds
143 lea .LCamellia_SBOX(%rip),$Tbl
144 lea ($key,%rdi),$keyend
146 mov 0(%rsi),@S[0] # load plaintext
147 mov 4(%rsi),@S[1]
148 mov 8(%rsi),@S[2]
149 bswap @S[0]
150 mov 12(%rsi),@S[3]
151 bswap @S[1]
152 bswap @S[2]
153 bswap @S[3]
155 call _x86_64_Camellia_encrypt
157 bswap @S[0]
158 bswap @S[1]
159 bswap @S[2]
160 mov @S[0],0($out)
161 bswap @S[3]
162 mov @S[1],4($out)
163 mov @S[2],8($out)
164 mov @S[3],12($out)
166 mov 0(%rsp),%r15
167 mov 8(%rsp),%r14
168 mov 16(%rsp),%r13
169 mov 24(%rsp),%rbp
170 mov 32(%rsp),%rbx
171 lea 40(%rsp),%rsp
172 .Lenc_epilogue:
174 .size Camellia_EncryptBlock_Rounds,.-Camellia_EncryptBlock_Rounds
176 .type _x86_64_Camellia_encrypt,\@abi-omnipotent
177 .align 16
178 _x86_64_Camellia_encrypt:
179 xor 0($key),@S[1]
180 xor 4($key),@S[0] # ^=key[0-3]
181 xor 8($key),@S[3]
182 xor 12($key),@S[2]
183 .align 16
184 .Leloop:
185 mov 16($key),$t1 # prefetch key[4-5]
186 mov 20($key),$t0
189 for ($i=0;$i<6;$i++) { Camellia_Feistel($i,16); }
190 $code.=<<___;
191 lea 16*4($key),$key
192 cmp $keyend,$key
193 mov 8($key),$t3 # prefetch key[2-3]
194 mov 12($key),$t2
195 je .Ledone
197 and @S[0],$t0
198 or @S[3],$t3
199 rol \$1,$t0
200 xor $t3,@S[2] # s2^=s3|key[3];
201 xor $t0,@S[1] # s1^=LeftRotate(s0&key[0],1);
202 and @S[2],$t2
203 or @S[1],$t1
204 rol \$1,$t2
205 xor $t1,@S[0] # s0^=s1|key[1];
206 xor $t2,@S[3] # s3^=LeftRotate(s2&key[2],1);
207 jmp .Leloop
209 .align 16
210 .Ledone:
211 xor @S[2],$t0 # SwapHalf
212 xor @S[3],$t1
213 xor @S[0],$t2
214 xor @S[1],$t3
216 mov $t0,@S[0]
217 mov $t1,@S[1]
218 mov $t2,@S[2]
219 mov $t3,@S[3]
221 .byte 0xf3,0xc3 # rep ret
222 .size _x86_64_Camellia_encrypt,.-_x86_64_Camellia_encrypt
224 # V1.x API
225 .globl Camellia_DecryptBlock
226 .type Camellia_DecryptBlock,\@abi-omnipotent
227 .align 16
228 Camellia_DecryptBlock:
229 movl \$128,%eax
230 subl $arg0d,%eax
231 movl \$3,$arg0d
232 adcl \$0,$arg0d # keyBitLength==128?3:4
233 jmp .Ldec_rounds
234 .size Camellia_DecryptBlock,.-Camellia_DecryptBlock
235 # V2
236 .globl Camellia_DecryptBlock_Rounds
237 .type Camellia_DecryptBlock_Rounds,\@function,4
238 .align 16
239 .Ldec_rounds:
240 Camellia_DecryptBlock_Rounds:
241 push %rbx
242 push %rbp
243 push %r13
244 push %r14
245 push %r15
246 .Ldec_prologue:
248 #mov %rsi,$inp # put away arguments
249 mov %rcx,$out
250 mov %rdx,$keyend
252 shl \$6,%edi # process grandRounds
253 lea .LCamellia_SBOX(%rip),$Tbl
254 lea ($keyend,%rdi),$key
256 mov 0(%rsi),@S[0] # load plaintext
257 mov 4(%rsi),@S[1]
258 mov 8(%rsi),@S[2]
259 bswap @S[0]
260 mov 12(%rsi),@S[3]
261 bswap @S[1]
262 bswap @S[2]
263 bswap @S[3]
265 call _x86_64_Camellia_decrypt
267 bswap @S[0]
268 bswap @S[1]
269 bswap @S[2]
270 mov @S[0],0($out)
271 bswap @S[3]
272 mov @S[1],4($out)
273 mov @S[2],8($out)
274 mov @S[3],12($out)
276 mov 0(%rsp),%r15
277 mov 8(%rsp),%r14
278 mov 16(%rsp),%r13
279 mov 24(%rsp),%rbp
280 mov 32(%rsp),%rbx
281 lea 40(%rsp),%rsp
282 .Ldec_epilogue:
284 .size Camellia_DecryptBlock_Rounds,.-Camellia_DecryptBlock_Rounds
286 .type _x86_64_Camellia_decrypt,\@abi-omnipotent
287 .align 16
288 _x86_64_Camellia_decrypt:
289 xor 0($key),@S[1]
290 xor 4($key),@S[0] # ^=key[0-3]
291 xor 8($key),@S[3]
292 xor 12($key),@S[2]
293 .align 16
294 .Ldloop:
295 mov -8($key),$t1 # prefetch key[4-5]
296 mov -4($key),$t0
299 for ($i=0;$i<6;$i++) { Camellia_Feistel($i,-8); }
300 $code.=<<___;
301 lea -16*4($key),$key
302 cmp $keyend,$key
303 mov 0($key),$t3 # prefetch key[2-3]
304 mov 4($key),$t2
305 je .Lddone
307 and @S[0],$t0
308 or @S[3],$t3
309 rol \$1,$t0
310 xor $t3,@S[2] # s2^=s3|key[3];
311 xor $t0,@S[1] # s1^=LeftRotate(s0&key[0],1);
312 and @S[2],$t2
313 or @S[1],$t1
314 rol \$1,$t2
315 xor $t1,@S[0] # s0^=s1|key[1];
316 xor $t2,@S[3] # s3^=LeftRotate(s2&key[2],1);
318 jmp .Ldloop
320 .align 16
321 .Lddone:
322 xor @S[2],$t2
323 xor @S[3],$t3
324 xor @S[0],$t0
325 xor @S[1],$t1
327 mov $t2,@S[0] # SwapHalf
328 mov $t3,@S[1]
329 mov $t0,@S[2]
330 mov $t1,@S[3]
332 .byte 0xf3,0xc3 # rep ret
333 .size _x86_64_Camellia_decrypt,.-_x86_64_Camellia_decrypt
# _saveround(rnd,key[,bias],regs...)
#
# Appends stores of key material to $code.  The destination is slot rnd
# (8 bytes per slot) relative to the base register key, plus an optional
# numeric bias.  The bias is recognised by being non-zero under int();
# register names numify to 0 and stay in the register list.
#
# With four registers, four stores are emitted at +0/+4/+8/+12 with the
# registers of each pair swapped.  Otherwise the first register goes to +0
# and, if present, the second to +8.
sub _saveround {
    my ($round,$base,@regs)=@_;
    my $bias=0;
    $bias=shift(@regs) if int($regs[0]);

    if (@regs==4) {
	$code.=<<___;
	mov $regs[1],`$bias+$round*8+0`($base)
	mov $regs[0],`$bias+$round*8+4`($base)
	mov $regs[3],`$bias+$round*8+8`($base)
	mov $regs[2],`$bias+$round*8+12`($base)
___
    } else {
	$code.=" mov $regs[0],`$bias+$round*8+0`($base)\n";
	if (@regs>=2) {
	    $code.=" mov $regs[1],`$bias+$round*8+8`($base)\n";
	}
    }
}
# _loadround(rnd,key[,bias],regs...)
#
# Appends loads of key material to $code: the first register is loaded
# from slot rnd (8 bytes per slot) relative to the base register key, and
# the second register, if given, from 8 bytes further on.  An optional
# leading non-zero number is added to both offsets as a bias.
sub _loadround {
    my ($round,$base,@regs)=@_;
    my $bias=0;
    $bias=shift(@regs) if int($regs[0]);

    $code.=" mov `$bias+$round*8+0`($base),$regs[0]\n";
    if (@regs>=2) {
	$code.=" mov `$bias+$round*8+8`($base),$regs[1]\n";
    }
}
361 # shld is very slow on Intel EM64T family. Even on AMD it limits
362 # instruction decode rate [because it's VectorPath] and consequently
363 # performance...
# __rotl128(i0,i1,rot): shld-based 128-bit left rotation of the register
# pair i0:i1 by rot bits.  Uses %r11 as scratch.  Nothing is emitted when
# rot is 0.
sub __rotl128 {
    my ($hi64,$lo64,$bits)=@_;

    if ($bits) {
	$code.=<<___;
	mov $hi64,%r11
	shld \$$bits,$lo64,$hi64
	shld \$$bits,%r11,$lo64
___
    }
}
# ... Implementing 128-bit rotate without shld gives 80% better
# performance on EM64T, +15% on AMD64 and only ~7% degradation on
# Core2. This is therefore preferred.
# _rotl128(i0,i1,rot): 128-bit left rotation of the register pair i0:i1 by
# rot bits, built from plain shifts and ORs.  Uses %r9 and %r11 as scratch.
# Nothing is emitted when rot is 0.  The `64-rot` shift counts are resolved
# by the eval pass at the end of the script.
sub _rotl128 {
    my ($hi64,$lo64,$bits)=@_;

    if ($bits) {
	$code.=<<___;
	mov $hi64,%r11
	shl \$$bits,$hi64
	mov $lo64,%r9
	shr \$`64-$bits`,%r9
	shr \$`64-$bits`,%r11
	or %r9,$hi64
	shl \$$bits,$lo64
	or %r11,$lo64
___
    }
}
396 { my $step=0;
398 $code.=<<___;
399 .globl Camellia_Ekeygen
400 .type Camellia_Ekeygen,\@function,3
401 .align 16
402 Camellia_Ekeygen:
403 push %rbx
404 push %rbp
405 push %r13
406 push %r14
407 push %r15
408 .Lkey_prologue:
410 mov %rdi,$keyend # put away arguments, keyBitLength
411 mov %rdx,$out # keyTable
413 mov 0(%rsi),@S[0] # load 0-127 bits
414 mov 4(%rsi),@S[1]
415 mov 8(%rsi),@S[2]
416 mov 12(%rsi),@S[3]
418 bswap @S[0]
419 bswap @S[1]
420 bswap @S[2]
421 bswap @S[3]
423 &_saveround (0,$out,@S); # KL<<<0
424 $code.=<<___;
425 cmp \$128,$keyend # check keyBitLength
426 je .L1st128
428 mov 16(%rsi),@S[0] # load 128-191 bits
429 mov 20(%rsi),@S[1]
430 cmp \$192,$keyend
431 je .L1st192
432 mov 24(%rsi),@S[2] # load 192-255 bits
433 mov 28(%rsi),@S[3]
434 jmp .L1st256
435 .L1st192:
436 mov @S[0],@S[2]
437 mov @S[1],@S[3]
438 not @S[2]
439 not @S[3]
440 .L1st256:
441 bswap @S[0]
442 bswap @S[1]
443 bswap @S[2]
444 bswap @S[3]
446 &_saveround (4,$out,@S); # temp storage for KR!
447 $code.=<<___;
448 xor 0($out),@S[1] # KR^KL
449 xor 4($out),@S[0]
450 xor 8($out),@S[3]
451 xor 12($out),@S[2]
453 .L1st128:
454 lea .LCamellia_SIGMA(%rip),$key
455 lea .LCamellia_SBOX(%rip),$Tbl
457 mov 0($key),$t1
458 mov 4($key),$t0
460 &Camellia_Feistel($step++);
461 &Camellia_Feistel($step++);
462 $code.=<<___;
463 xor 0($out),@S[1] # ^KL
464 xor 4($out),@S[0]
465 xor 8($out),@S[3]
466 xor 12($out),@S[2]
468 &Camellia_Feistel($step++);
469 &Camellia_Feistel($step++);
470 $code.=<<___;
471 cmp \$128,$keyend
472 jne .L2nd256
474 lea 128($out),$out # size optimization
475 shl \$32,%r8 # @S[0]||
476 shl \$32,%r10 # @S[2]||
477 or %r9,%r8 # ||@S[1]
478 or %r11,%r10 # ||@S[3]
# 128-bit key: emit the remaining subkey stores as successive left
# rotations of KL (held in %rax:%rbx) and KA (held in %r8:%r10).
# Every _rotl128 amount is incremental; the running total is noted next to
# it.  All offsets carry a -128 bias because $out was advanced by 128 just
# before this sequence.  Slots 12 and 13 each store a single register.
&_loadround (0,$out,-128,"%rax","%rbx"); # KL
&_saveround (2,$out,-128,"%r8","%r10"); # KA<<<0
&_rotl128 ("%rax","%rbx",15);
&_saveround (4,$out,-128,"%rax","%rbx"); # KL<<<15
&_rotl128 ("%r8","%r10",15);
&_saveround (6,$out,-128,"%r8","%r10"); # KA<<<15
&_rotl128 ("%r8","%r10",15); # 15+15=30
&_saveround (8,$out,-128,"%r8","%r10"); # KA<<<30
&_rotl128 ("%rax","%rbx",30); # 15+30=45
&_saveround (10,$out,-128,"%rax","%rbx"); # KL<<<45
&_rotl128 ("%r8","%r10",15); # 30+15=45
&_saveround (12,$out,-128,"%r8"); # KA<<<45
&_rotl128 ("%rax","%rbx",15); # 45+15=60
&_saveround (13,$out,-128,"%rbx"); # KL<<<60
&_rotl128 ("%r8","%r10",15); # 45+15=60
&_saveround (14,$out,-128,"%r8","%r10"); # KA<<<60
&_rotl128 ("%rax","%rbx",17); # 60+17=77
&_saveround (16,$out,-128,"%rax","%rbx"); # KL<<<77
&_rotl128 ("%rax","%rbx",17); # 77+17=94
&_saveround (18,$out,-128,"%rax","%rbx"); # KL<<<94
&_rotl128 ("%r8","%r10",34); # 60+34=94
&_saveround (20,$out,-128,"%r8","%r10"); # KA<<<94
&_rotl128 ("%rax","%rbx",17); # 94+17=111
&_saveround (22,$out,-128,"%rax","%rbx"); # KL<<<111
&_rotl128 ("%r8","%r10",17); # 94+17=111
&_saveround (24,$out,-128,"%r8","%r10"); # KA<<<111
506 $code.=<<___;
507 mov \$3,%eax
508 jmp .Ldone
509 .align 16
510 .L2nd256:
512 &_saveround (6,$out,@S); # temp storage for KA!
513 $code.=<<___;
514 xor `4*8+0`($out),@S[1] # KA^KR
515 xor `4*8+4`($out),@S[0]
516 xor `5*8+0`($out),@S[3]
517 xor `5*8+4`($out),@S[2]
519 &Camellia_Feistel($step++);
520 &Camellia_Feistel($step++);
522 &_loadround (0,$out,"%rax","%rbx"); # KL
523 &_loadround (4,$out,"%rcx","%rdx"); # KR
524 &_loadround (6,$out,"%r14","%r15"); # KA
525 $code.=<<___;
526 lea 128($out),$out # size optimization
527 shl \$32,%r8 # @S[0]||
528 shl \$32,%r10 # @S[2]||
529 or %r9,%r8 # ||@S[1]
530 or %r11,%r10 # ||@S[3]
# 192/256-bit key: emit the remaining subkey stores as successive left
# rotations of KL (%rax:%rbx), KR (%rcx:%rdx), KA (%r14:%r15) and
# KB (%r8:%r10).  Every _rotl128 amount is incremental; the running total
# is noted next to it.  All offsets carry a -128 bias because $out was
# advanced by 128 just before this sequence.
&_saveround (2,$out,-128,"%r8","%r10"); # KB<<<0
&_rotl128 ("%rcx","%rdx",15);
&_saveround (4,$out,-128,"%rcx","%rdx"); # KR<<<15
&_rotl128 ("%r14","%r15",15);
&_saveround (6,$out,-128,"%r14","%r15"); # KA<<<15
&_rotl128 ("%rcx","%rdx",15); # 15+15=30
&_saveround (8,$out,-128,"%rcx","%rdx"); # KR<<<30
&_rotl128 ("%r8","%r10",30);
&_saveround (10,$out,-128,"%r8","%r10"); # KB<<<30
&_rotl128 ("%rax","%rbx",45);
&_saveround (12,$out,-128,"%rax","%rbx"); # KL<<<45
&_rotl128 ("%r14","%r15",30); # 15+30=45
&_saveround (14,$out,-128,"%r14","%r15"); # KA<<<45
&_rotl128 ("%rax","%rbx",15); # 45+15=60
&_saveround (16,$out,-128,"%rax","%rbx"); # KL<<<60
&_rotl128 ("%rcx","%rdx",30); # 30+30=60
&_saveround (18,$out,-128,"%rcx","%rdx"); # KR<<<60
&_rotl128 ("%r8","%r10",30); # 30+30=60
&_saveround (20,$out,-128,"%r8","%r10"); # KB<<<60
&_rotl128 ("%rax","%rbx",17); # 60+17=77
&_saveround (22,$out,-128,"%rax","%rbx"); # KL<<<77
&_rotl128 ("%r14","%r15",32); # 45+32=77
&_saveround (24,$out,-128,"%r14","%r15"); # KA<<<77
&_rotl128 ("%rcx","%rdx",34); # 60+34=94
&_saveround (26,$out,-128,"%rcx","%rdx"); # KR<<<94
&_rotl128 ("%r14","%r15",17); # 77+17=94
&_saveround (28,$out,-128,"%r14","%r15"); # KA<<<94
&_rotl128 ("%rax","%rbx",34); # 77+34=111
&_saveround (30,$out,-128,"%rax","%rbx"); # KL<<<111
&_rotl128 ("%r8","%r10",51); # 60+51=111
&_saveround (32,$out,-128,"%r8","%r10"); # KB<<<111
563 $code.=<<___;
564 mov \$4,%eax
565 .Ldone:
566 mov 0(%rsp),%r15
567 mov 8(%rsp),%r14
568 mov 16(%rsp),%r13
569 mov 24(%rsp),%rbp
570 mov 32(%rsp),%rbx
571 lea 40(%rsp),%rsp
572 .Lkey_epilogue:
574 .size Camellia_Ekeygen,.-Camellia_Ekeygen
578 @SBOX=(
579 112,130, 44,236,179, 39,192,229,228,133, 87, 53,234, 12,174, 65,
580 35,239,107,147, 69, 25,165, 33,237, 14, 79, 78, 29,101,146,189,
581 134,184,175,143,124,235, 31,206, 62, 48,220, 95, 94,197, 11, 26,
582 166,225, 57,202,213, 71, 93, 61,217, 1, 90,214, 81, 86,108, 77,
583 139, 13,154,102,251,204,176, 45,116, 18, 43, 32,240,177,132,153,
584 223, 76,203,194, 52,126,118, 5,109,183,169, 49,209, 23, 4,215,
585 20, 88, 58, 97,222, 27, 17, 28, 50, 15,156, 22, 83, 24,242, 34,
586 254, 68,207,178,195,181,122,145, 36, 8,232,168, 96,252,105, 80,
587 170,208,160,125,161,137, 98,151, 84, 91, 30,149,224,255,100,210,
588 16,196, 0, 72,163,247,117,219,138, 3,230,218, 9, 63,221,148,
589 135, 92,131, 2,205, 74,144, 51,115,103,246,243,157,127,191,226,
590 82,155,216, 38,200, 55,198, 59,129,150,111, 75, 19,190, 99, 46,
591 233,121,167,140,159,110,188,142, 41,245,249,182, 47,253,180, 89,
592 120,152, 6,106,231, 70,113,186,212, 37,171, 66,136,162,141,250,
593 114, 7,185, 85,248,238,172, 10, 54, 73, 42,104, 60, 56,241,164,
594 64, 40,211,123,187,201, 67,193, 21,227,173,244,119,199,128,158);
# Table-entry generators.  Each takes an index and returns the 32-bit table
# word for that index as a "0x%08x" string.  The digits in a name give the
# byte layout of the word, most significant byte first: 0 is a zero byte,
# a non-zero digit is the substituted byte.

# @SBOX value replicated into the three upper bytes.
sub S1110 {
    my $v=$SBOX[$_[0]];
    sprintf("0x%08x",$v<<24|$v<<16|$v<<8);
}
# @SBOX looked up at the index rotated left by one bit; bytes 3, 2 and 0.
sub S4404 {
    my $idx=$_[0];
    my $v=$SBOX[($idx<<1|$idx>>7)&0xff];
    sprintf("0x%08x",$v<<24|$v<<16|$v);
}
# @SBOX value rotated left by one bit; the three lower bytes.
sub S0222 {
    my $v=$SBOX[$_[0]];
    $v=($v<<1|$v>>7)&0xff;
    sprintf("0x%08x",$v<<16|$v<<8|$v);
}
# @SBOX value rotated right by one bit; bytes 3, 1 and 0.
sub S3033 {
    my $v=$SBOX[$_[0]];
    $v=($v>>1|$v<<7)&0xff;
    sprintf("0x%08x",$v<<24|$v<<8|$v);
}
601 $code.=<<___;
602 .align 64
603 .LCamellia_SIGMA:
604 .long 0x3bcc908b, 0xa09e667f, 0x4caa73b2, 0xb67ae858
605 .long 0xe94f82be, 0xc6ef372f, 0xf1d36f1c, 0x54ff53a5
606 .long 0xde682d1d, 0x10e527fa, 0xb3e6c1fd, 0xb05688c2
607 .long 0, 0, 0, 0
608 .LCamellia_SBOX:
610 # tables are interleaved, remember?
# data_word(values...): append one ".long" directive holding the given
# values, comma-separated, to $code.
sub data_word { my $row=join(',',@_); $code.=".long\t$row\n"; }
# Emit the two interleaved tables: 256 (S1110,S4404) pairs followed by
# 256 (S0222,S3033) pairs, one .long directive per pair.
$i=0;
while ($i<256) { data_word(S1110($i),S4404($i)); $i++; }
$i=0;
while ($i<256) { data_word(S0222($i),S3033($i)); $i++; }
# void Camellia_cbc_encrypt (const unsigned char *inp, unsigned char *out,
#			size_t length, const CAMELLIA_KEY *key,
#			unsigned char *ivp,const int enc);
# Stack frame slots used by Camellia_cbc_encrypt, as %rsp-relative operands.
($_key,$_end,$_res)=(
	"0(%rsp)",	# saved key schedule pointer
	"8(%rsp)",	# inp+len&~15
	"16(%rsp)");	# len&15
($ivec,$_ivp,$_rsp)=(
	"24(%rsp)",	# 16-byte IV / partial-block scratch
	"40(%rsp)",	# caller's ivp argument
	"48(%rsp)");	# stack pointer value saved on entry
626 $code.=<<___;
627 .globl Camellia_cbc_encrypt
628 .type Camellia_cbc_encrypt,\@function,6
629 .align 16
630 Camellia_cbc_encrypt:
631 cmp \$0,%rdx
632 je .Lcbc_abort
633 push %rbx
634 push %rbp
635 push %r12
636 push %r13
637 push %r14
638 push %r15
639 .Lcbc_prologue:
641 mov %rsp,%rbp
642 sub \$64,%rsp
643 and \$-64,%rsp
645 # place stack frame just "above mod 1024" the key schedule,
646 # this ensures that cache associativity suffices
647 lea -64-63(%rcx),%r10
648 sub %rsp,%r10
649 neg %r10
650 and \$0x3C0,%r10
651 sub %r10,%rsp
652 #add \$8,%rsp # 8 is reserved for callee's ra
654 mov %rdi,$inp # inp argument
655 mov %rsi,$out # out argument
656 mov %r8,%rbx # ivp argument
657 mov %rcx,$key # key argument
658 mov 272(%rcx),${keyend}d # grandRounds
660 mov %r8,$_ivp
661 mov %rbp,$_rsp
663 .Lcbc_body:
664 lea .LCamellia_SBOX(%rip),$Tbl
666 mov \$32,%ecx
667 .align 4
668 .Lcbc_prefetch_sbox:
669 mov 0($Tbl),%rax
670 mov 32($Tbl),%rsi
671 mov 64($Tbl),%rdi
672 mov 96($Tbl),%r11
673 lea 128($Tbl),$Tbl
674 loop .Lcbc_prefetch_sbox
675 sub \$4096,$Tbl
676 shl \$6,$keyend
677 mov %rdx,%rcx # len argument
678 lea ($key,$keyend),$keyend
680 cmp \$0,%r9d # enc argument
681 je .LCBC_DECRYPT
683 and \$-16,%rdx
684 and \$15,%rcx # length residue
685 lea ($inp,%rdx),%rdx
686 mov $key,$_key
687 mov %rdx,$_end
688 mov %rcx,$_res
690 cmp $inp,%rdx
691 mov 0(%rbx),@S[0] # load IV
692 mov 4(%rbx),@S[1]
693 mov 8(%rbx),@S[2]
694 mov 12(%rbx),@S[3]
695 je .Lcbc_enc_tail
696 jmp .Lcbc_eloop
698 .align 16
699 .Lcbc_eloop:
700 xor 0($inp),@S[0]
701 xor 4($inp),@S[1]
702 xor 8($inp),@S[2]
703 bswap @S[0]
704 xor 12($inp),@S[3]
705 bswap @S[1]
706 bswap @S[2]
707 bswap @S[3]
709 call _x86_64_Camellia_encrypt
711 mov $_key,$key # "rewind" the key
712 bswap @S[0]
713 mov $_end,%rdx
714 bswap @S[1]
715 mov $_res,%rcx
716 bswap @S[2]
717 mov @S[0],0($out)
718 bswap @S[3]
719 mov @S[1],4($out)
720 mov @S[2],8($out)
721 lea 16($inp),$inp
722 mov @S[3],12($out)
723 cmp %rdx,$inp
724 lea 16($out),$out
725 jne .Lcbc_eloop
727 cmp \$0,%rcx
728 jne .Lcbc_enc_tail
730 mov $_ivp,$out
731 mov @S[0],0($out) # write out IV residue
732 mov @S[1],4($out)
733 mov @S[2],8($out)
734 mov @S[3],12($out)
735 jmp .Lcbc_done
737 .align 16
738 .Lcbc_enc_tail:
739 xor %rax,%rax
740 mov %rax,0+$ivec
741 mov %rax,8+$ivec
742 mov %rax,$_res
744 .Lcbc_enc_pushf:
745 pushfq
747 mov $inp,%rsi
748 lea 8+$ivec,%rdi
749 .long 0x9066A4F3 # rep movsb
750 popfq
751 .Lcbc_enc_popf:
753 lea $ivec,$inp
754 lea 16+$ivec,%rax
755 mov %rax,$_end
756 jmp .Lcbc_eloop # one more time
758 .align 16
759 .LCBC_DECRYPT:
760 xchg $key,$keyend
761 add \$15,%rdx
762 and \$15,%rcx # length residue
763 and \$-16,%rdx
764 mov $key,$_key
765 lea ($inp,%rdx),%rdx
766 mov %rdx,$_end
767 mov %rcx,$_res
769 mov (%rbx),%rax # load IV
770 mov 8(%rbx),%rbx
771 jmp .Lcbc_dloop
772 .align 16
773 .Lcbc_dloop:
774 mov 0($inp),@S[0]
775 mov 4($inp),@S[1]
776 mov 8($inp),@S[2]
777 bswap @S[0]
778 mov 12($inp),@S[3]
779 bswap @S[1]
780 mov %rax,0+$ivec # save IV to temporary storage
781 bswap @S[2]
782 mov %rbx,8+$ivec
783 bswap @S[3]
785 call _x86_64_Camellia_decrypt
787 mov $_key,$key # "rewind" the key
788 mov $_end,%rdx
789 mov $_res,%rcx
791 bswap @S[0]
792 mov ($inp),%rax # load IV for next iteration
793 bswap @S[1]
794 mov 8($inp),%rbx
795 bswap @S[2]
796 xor 0+$ivec,@S[0]
797 bswap @S[3]
798 xor 4+$ivec,@S[1]
799 xor 8+$ivec,@S[2]
800 lea 16($inp),$inp
801 xor 12+$ivec,@S[3]
802 cmp %rdx,$inp
803 je .Lcbc_ddone
805 mov @S[0],0($out)
806 mov @S[1],4($out)
807 mov @S[2],8($out)
808 mov @S[3],12($out)
810 lea 16($out),$out
811 jmp .Lcbc_dloop
813 .align 16
814 .Lcbc_ddone:
815 mov $_ivp,%rdx
816 cmp \$0,%rcx
817 jne .Lcbc_dec_tail
819 mov @S[0],0($out)
820 mov @S[1],4($out)
821 mov @S[2],8($out)
822 mov @S[3],12($out)
824 mov %rax,(%rdx) # write out IV residue
825 mov %rbx,8(%rdx)
826 jmp .Lcbc_done
827 .align 16
828 .Lcbc_dec_tail:
829 mov @S[0],0+$ivec
830 mov @S[1],4+$ivec
831 mov @S[2],8+$ivec
832 mov @S[3],12+$ivec
834 .Lcbc_dec_pushf:
835 pushfq
837 lea 8+$ivec,%rsi
838 lea ($out),%rdi
839 .long 0x9066A4F3 # rep movsb
840 popfq
841 .Lcbc_dec_popf:
843 mov %rax,(%rdx) # write out IV residue
844 mov %rbx,8(%rdx)
845 jmp .Lcbc_done
847 .align 16
848 .Lcbc_done:
849 mov $_rsp,%rcx
850 mov 0(%rcx),%r15
851 mov 8(%rcx),%r14
852 mov 16(%rcx),%r13
853 mov 24(%rcx),%r12
854 mov 32(%rcx),%rbp
855 mov 40(%rcx),%rbx
856 lea 48(%rcx),%rsp
857 .Lcbc_abort:
859 .size Camellia_cbc_encrypt,.-Camellia_cbc_encrypt
861 .asciz "Camellia for x86_64 by <appro\@openssl.org>"
# Resolve every `...` placeholder left in the generated text by evaluating
# its contents as Perl (register-name helpers and offset arithmetic), then
# hand the result to the translator.
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
# STDOUT is the pipe to the translator.  close() on a pipe returns false
# when the write side fails or the child exits non-zero, so check it:
# otherwise a failed translation would still look like a successful run.
close STDOUT or die "error closing STDOUT: $! (exit status $?)";