OpenSSL: Update to version 1.0.1e
[tomato.git] / release / src / router / openssl / crypto / camellia / asm / cmll-x86.pl
blobc314d62312f00c32f1e710858647a7f8d2910554
1 #!/usr/bin/env perl
3 # ====================================================================
4 # Copyright (c) 2008 Andy Polyakov <appro@openssl.org>
6 # This module may be used under the terms of either the GNU General
7 # Public License version 2 or later, the GNU Lesser General Public
8 # License version 2.1 or later, the Mozilla Public License version
9 # 1.1 or the BSD License. The exact terms of either license are
10 # distributed along with this module. For further details see
11 # http://www.openssl.org/~appro/camellia/.
12 # ====================================================================
14 # Performance in cycles per processed byte (less is better) in
15 # 'openssl speed ...' benchmark:
17 # AMD K8 Core2 PIII P4
18 # -evp camellia-128-ecb 21.5 22.8 27.0 28.9
19 # + over gcc 3.4.6 +90/11% +70/10% +53/4% +160/64%
20 # + over icc 8.0 +48/19% +21/15% +21/17% +55/37%
22 # camellia-128-cbc 17.3 21.1 23.9 25.9
24 # 128-bit key setup 196 280 256 240 cycles/key
25 # + over gcc 3.4.6 +30/0% +17/11% +11/0% +63/40%
26 # + over icc 8.0 +18/3% +10/0% +10/3% +21/10%
28 # Pairs of numbers in "+" rows represent performance improvement over
29 # compiler generated position-independent code, PIC, and non-PIC
30 # respectively. PIC results are of greater relevance, as this module
31 # is position-independent, i.e. suitable for a shared library or PIE.
32 # Position independence "costs" one register, which is why compilers
33 # are so close with non-PIC results, they have an extra register to
34 # spare. CBC results are better than ECB ones thanks to "zero-copy"
35 # private _x86_* interface, and are ~30-40% better than with compiler
36 # generated cmll_cbc.o, and reach ~80-90% of x86_64 performance on
37 # same CPU (where applicable).
39 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
40 push(@INC,"${dir}","${dir}../../perlasm");
41 require "x86asm.pl";
43 $OPENSSL=1;
45 &asm_init($ARGV[0],"cmll-586.pl",$ARGV[$#ARGV] eq "386");
47 @T=("eax","ebx","ecx","edx");
48 $idx="esi";
49 $key="edi";
50 $Tbl="ebp";
52 # stack frame layout in _x86_Camellia_* routines, frame is allocated
53 # by caller
54 $__ra=&DWP(0,"esp"); # return address
55 $__s0=&DWP(4,"esp"); # s0 backing store
56 $__s1=&DWP(8,"esp"); # s1 backing store
57 $__s2=&DWP(12,"esp"); # s2 backing store
58 $__s3=&DWP(16,"esp"); # s3 backing store
59 $__end=&DWP(20,"esp"); # pointer to end/start of key schedule
61 # stack frame layout in Camellia_[en|crypt] routines, which differs from
62 # above by 4 and overlaps by pointer to end/start of key schedule
63 $_end=&DWP(16,"esp");
64 $_esp=&DWP(20,"esp");
66 # const unsigned int Camellia_SBOX[4][256];
67 # Well, sort of... Camellia_SBOX[0][] is interleaved with [1][],
68 # and [2][] - with [3][]. This is done to optimize code size.
69 $SBOX1_1110=0; # Camellia_SBOX[0]
70 $SBOX4_4404=4; # Camellia_SBOX[1]
71 $SBOX2_0222=2048; # Camellia_SBOX[2]
72 $SBOX3_3033=2052; # Camellia_SBOX[3]
73 &static_label("Camellia_SIGMA");
74 &static_label("Camellia_SBOX");
76 sub Camellia_Feistel {
77 my $i=@_[0];
78 my $seed=defined(@_[1])?@_[1]:0;
79 my $scale=$seed<0?-8:8;
80 my $frame=defined(@_[2])?@_[2]:0;
81 my $j=($i&1)*2;
82 my $t0=@T[($j)%4],$t1=@T[($j+1)%4],$t2=@T[($j+2)%4],$t3=@T[($j+3)%4];
84 &xor ($t0,$idx); # t0^=key[0]
85 &xor ($t1,&DWP($seed+$i*$scale+4,$key)); # t1^=key[1]
86 &movz ($idx,&HB($t0)); # (t0>>8)&0xff
87 &mov ($t3,&DWP($SBOX3_3033,$Tbl,$idx,8)); # t3=SBOX3_3033[0]
88 &movz ($idx,&LB($t0)); # (t0>>0)&0xff
89 &xor ($t3,&DWP($SBOX4_4404,$Tbl,$idx,8)); # t3^=SBOX4_4404[0]
90 &shr ($t0,16);
91 &movz ($idx,&LB($t1)); # (t1>>0)&0xff
92 &mov ($t2,&DWP($SBOX1_1110,$Tbl,$idx,8)); # t2=SBOX1_1110[1]
93 &movz ($idx,&HB($t0)); # (t0>>24)&0xff
94 &xor ($t3,&DWP($SBOX1_1110,$Tbl,$idx,8)); # t3^=SBOX1_1110[0]
95 &movz ($idx,&HB($t1)); # (t1>>8)&0xff
96 &xor ($t2,&DWP($SBOX4_4404,$Tbl,$idx,8)); # t2^=SBOX4_4404[1]
97 &shr ($t1,16);
98 &movz ($t0,&LB($t0)); # (t0>>16)&0xff
99 &xor ($t3,&DWP($SBOX2_0222,$Tbl,$t0,8)); # t3^=SBOX2_0222[0]
100 &movz ($idx,&HB($t1)); # (t1>>24)&0xff
101 &mov ($t0,&DWP($frame+4*(($j+3)%4),"esp")); # prefetch "s3"
102 &xor ($t2,$t3); # t2^=t3
103 &rotr ($t3,8); # t3=RightRotate(t3,8)
104 &xor ($t2,&DWP($SBOX2_0222,$Tbl,$idx,8)); # t2^=SBOX2_0222[1]
105 &movz ($idx,&LB($t1)); # (t1>>16)&0xff
106 &mov ($t1,&DWP($frame+4*(($j+2)%4),"esp")); # prefetch "s2"
107 &xor ($t3,$t0); # t3^=s3
108 &xor ($t2,&DWP($SBOX3_3033,$Tbl,$idx,8)); # t2^=SBOX3_3033[1]
109 &mov ($idx,&DWP($seed+($i+1)*$scale,$key)); # prefetch key[i+1]
110 &xor ($t3,$t2); # t3^=t2
111 &mov (&DWP($frame+4*(($j+3)%4),"esp"),$t3); # s3=t3
112 &xor ($t2,$t1); # t2^=s2
113 &mov (&DWP($frame+4*(($j+2)%4),"esp"),$t2); # s2=t2
116 # void Camellia_EncryptBlock_Rounds(
117 # int grandRounds,
118 # const Byte plaintext[],
119 # const KEY_TABLE_TYPE keyTable,
120 # Byte ciphertext[])
121 &function_begin("Camellia_EncryptBlock_Rounds");
122 &mov ("eax",&wparam(0)); # load grandRounds
123 &mov ($idx,&wparam(1)); # load plaintext pointer
124 &mov ($key,&wparam(2)); # load key schedule pointer
126 &mov ("ebx","esp");
127 &sub ("esp",7*4); # place for s[0-3],keyEnd,esp and ra
128 &and ("esp",-64);
130 # place stack frame just "above mod 1024" the key schedule
131 # this ensures that cache associativity of 2 suffices
132 &lea ("ecx",&DWP(-64-63,$key));
133 &sub ("ecx","esp");
134 &neg ("ecx");
135 &and ("ecx",0x3C0); # modulo 1024, but aligned to cache-line
136 &sub ("esp","ecx");
137 &add ("esp",4); # 4 is reserved for callee's return address
139 &shl ("eax",6);
140 &lea ("eax",&DWP(0,$key,"eax"));
141 &mov ($_esp,"ebx"); # save %esp
142 &mov ($_end,"eax"); # save keyEnd
144 &call (&label("pic_point"));
145 &set_label("pic_point");
146 &blindpop($Tbl);
147 &lea ($Tbl,&DWP(&label("Camellia_SBOX")."-".&label("pic_point"),$Tbl));
149 &mov (@T[0],&DWP(0,$idx)); # load plaintext
150 &mov (@T[1],&DWP(4,$idx));
151 &mov (@T[2],&DWP(8,$idx));
152 &bswap (@T[0]);
153 &mov (@T[3],&DWP(12,$idx));
154 &bswap (@T[1]);
155 &bswap (@T[2]);
156 &bswap (@T[3]);
158 &call ("_x86_Camellia_encrypt");
160 &mov ("esp",$_esp);
161 &bswap (@T[0]);
162 &mov ($idx,&wparam(3)); # load ciphertext pointer
163 &bswap (@T[1]);
164 &bswap (@T[2]);
165 &bswap (@T[3]);
166 &mov (&DWP(0,$idx),@T[0]); # write ciphertext
167 &mov (&DWP(4,$idx),@T[1]);
168 &mov (&DWP(8,$idx),@T[2]);
169 &mov (&DWP(12,$idx),@T[3]);
170 &function_end("Camellia_EncryptBlock_Rounds");
171 # V1.x API
172 &function_begin_B("Camellia_EncryptBlock");
173 &mov ("eax",128);
174 &sub ("eax",&wparam(0)); # load keyBitLength
175 &mov ("eax",3);
176 &adc ("eax",0); # keyBitLength==128?3:4
177 &mov (&wparam(0),"eax");
178 &jmp (&label("Camellia_EncryptBlock_Rounds"));
179 &function_end_B("Camellia_EncryptBlock");
181 if ($OPENSSL) {
182 # void Camellia_encrypt(
183 # const unsigned char *in,
184 # unsigned char *out,
185 # const CAMELLIA_KEY *key)
186 &function_begin("Camellia_encrypt");
187 &mov ($idx,&wparam(0)); # load plaintext pointer
188 &mov ($key,&wparam(2)); # load key schedule pointer
190 &mov ("ebx","esp");
191 &sub ("esp",7*4); # place for s[0-3],keyEnd,esp and ra
192 &and ("esp",-64);
193 &mov ("eax",&DWP(272,$key)); # load grandRounds counter
195 # place stack frame just "above mod 1024" the key schedule
196 # this ensures that cache associativity of 2 suffices
197 &lea ("ecx",&DWP(-64-63,$key));
198 &sub ("ecx","esp");
199 &neg ("ecx");
200 &and ("ecx",0x3C0); # modulo 1024, but aligned to cache-line
201 &sub ("esp","ecx");
202 &add ("esp",4); # 4 is reserved for callee's return address
204 &shl ("eax",6);
205 &lea ("eax",&DWP(0,$key,"eax"));
206 &mov ($_esp,"ebx"); # save %esp
207 &mov ($_end,"eax"); # save keyEnd
209 &call (&label("pic_point"));
210 &set_label("pic_point");
211 &blindpop($Tbl);
212 &lea ($Tbl,&DWP(&label("Camellia_SBOX")."-".&label("pic_point"),$Tbl));
214 &mov (@T[0],&DWP(0,$idx)); # load plaintext
215 &mov (@T[1],&DWP(4,$idx));
216 &mov (@T[2],&DWP(8,$idx));
217 &bswap (@T[0]);
218 &mov (@T[3],&DWP(12,$idx));
219 &bswap (@T[1]);
220 &bswap (@T[2]);
221 &bswap (@T[3]);
223 &call ("_x86_Camellia_encrypt");
225 &mov ("esp",$_esp);
226 &bswap (@T[0]);
227 &mov ($idx,&wparam(1)); # load ciphertext pointer
228 &bswap (@T[1]);
229 &bswap (@T[2]);
230 &bswap (@T[3]);
231 &mov (&DWP(0,$idx),@T[0]); # write ciphertext
232 &mov (&DWP(4,$idx),@T[1]);
233 &mov (&DWP(8,$idx),@T[2]);
234 &mov (&DWP(12,$idx),@T[3]);
235 &function_end("Camellia_encrypt");
238 &function_begin_B("_x86_Camellia_encrypt");
239 &xor (@T[0],&DWP(0,$key)); # ^=key[0-3]
240 &xor (@T[1],&DWP(4,$key));
241 &xor (@T[2],&DWP(8,$key));
242 &xor (@T[3],&DWP(12,$key));
243 &mov ($idx,&DWP(16,$key)); # prefetch key[4]
245 &mov ($__s0,@T[0]); # save s[0-3]
246 &mov ($__s1,@T[1]);
247 &mov ($__s2,@T[2]);
248 &mov ($__s3,@T[3]);
250 &set_label("loop",16);
251 for ($i=0;$i<6;$i++) { Camellia_Feistel($i,16,4); }
253 &add ($key,16*4);
254 &cmp ($key,$__end);
255 &je (&label("done"));
257 # @T[0-1] are preloaded, $idx is preloaded with key[0]
258 &and ($idx,@T[0]);
259 &mov (@T[3],$__s3);
260 &rotl ($idx,1);
261 &mov (@T[2],@T[3]);
262 &xor (@T[1],$idx);
263 &or (@T[2],&DWP(12,$key));
264 &mov ($__s1,@T[1]); # s1^=LeftRotate(s0&key[0],1);
265 &xor (@T[2],$__s2);
267 &mov ($idx,&DWP(4,$key));
268 &mov ($__s2,@T[2]); # s2^=s3|key[3];
269 &or ($idx,@T[1]);
270 &and (@T[2],&DWP(8,$key));
271 &xor (@T[0],$idx);
272 &rotl (@T[2],1);
273 &mov ($__s0,@T[0]); # s0^=s1|key[1];
274 &xor (@T[3],@T[2]);
275 &mov ($idx,&DWP(16,$key)); # prefetch key[4]
276 &mov ($__s3,@T[3]); # s3^=LeftRotate(s2&key[2],1);
277 &jmp (&label("loop"));
279 &set_label("done",8);
280 &mov (@T[2],@T[0]); # SwapHalf
281 &mov (@T[3],@T[1]);
282 &mov (@T[0],$__s2);
283 &mov (@T[1],$__s3);
284 &xor (@T[0],$idx); # $idx is preloaded with key[0]
285 &xor (@T[1],&DWP(4,$key));
286 &xor (@T[2],&DWP(8,$key));
287 &xor (@T[3],&DWP(12,$key));
288 &ret ();
289 &function_end_B("_x86_Camellia_encrypt");
291 # void Camellia_DecryptBlock_Rounds(
292 # int grandRounds,
293 # const Byte ciphertext[],
294 # const KEY_TABLE_TYPE keyTable,
295 # Byte plaintext[])
296 &function_begin("Camellia_DecryptBlock_Rounds");
297 &mov ("eax",&wparam(0)); # load grandRounds
298 &mov ($idx,&wparam(1)); # load ciphertext pointer
299 &mov ($key,&wparam(2)); # load key schedule pointer
301 &mov ("ebx","esp");
302 &sub ("esp",7*4); # place for s[0-3],keyEnd,esp and ra
303 &and ("esp",-64);
305 # place stack frame just "above mod 1024" the key schedule
306 # this ensures that cache associativity of 2 suffices
307 &lea ("ecx",&DWP(-64-63,$key));
308 &sub ("ecx","esp");
309 &neg ("ecx");
310 &and ("ecx",0x3C0); # modulo 1024, but aligned to cache-line
311 &sub ("esp","ecx");
312 &add ("esp",4); # 4 is reserved for callee's return address
314 &shl ("eax",6);
315 &mov (&DWP(4*4,"esp"),$key); # save keyStart
316 &lea ($key,&DWP(0,$key,"eax"));
317 &mov (&DWP(5*4,"esp"),"ebx");# save %esp
319 &call (&label("pic_point"));
320 &set_label("pic_point");
321 &blindpop($Tbl);
322 &lea ($Tbl,&DWP(&label("Camellia_SBOX")."-".&label("pic_point"),$Tbl));
324 &mov (@T[0],&DWP(0,$idx)); # load ciphertext
325 &mov (@T[1],&DWP(4,$idx));
326 &mov (@T[2],&DWP(8,$idx));
327 &bswap (@T[0]);
328 &mov (@T[3],&DWP(12,$idx));
329 &bswap (@T[1]);
330 &bswap (@T[2]);
331 &bswap (@T[3]);
333 &call ("_x86_Camellia_decrypt");
335 &mov ("esp",&DWP(5*4,"esp"));
336 &bswap (@T[0]);
337 &mov ($idx,&wparam(3)); # load plaintext pointer
338 &bswap (@T[1]);
339 &bswap (@T[2]);
340 &bswap (@T[3]);
341 &mov (&DWP(0,$idx),@T[0]); # write plaintext
342 &mov (&DWP(4,$idx),@T[1]);
343 &mov (&DWP(8,$idx),@T[2]);
344 &mov (&DWP(12,$idx),@T[3]);
345 &function_end("Camellia_DecryptBlock_Rounds");
346 # V1.x API
347 &function_begin_B("Camellia_DecryptBlock");
348 &mov ("eax",128);
349 &sub ("eax",&wparam(0)); # load keyBitLength
350 &mov ("eax",3);
351 &adc ("eax",0); # keyBitLength==128?3:4
352 &mov (&wparam(0),"eax");
353 &jmp (&label("Camellia_DecryptBlock_Rounds"));
354 &function_end_B("Camellia_DecryptBlock");
356 if ($OPENSSL) {
357 # void Camellia_decrypt(
358 # const unsigned char *in,
359 # unsigned char *out,
360 # const CAMELLIA_KEY *key)
361 &function_begin("Camellia_decrypt");
362 &mov ($idx,&wparam(0)); # load ciphertext pointer
363 &mov ($key,&wparam(2)); # load key schedule pointer
365 &mov ("ebx","esp");
366 &sub ("esp",7*4); # place for s[0-3],keyEnd,esp and ra
367 &and ("esp",-64);
368 &mov ("eax",&DWP(272,$key)); # load grandRounds counter
370 # place stack frame just "above mod 1024" the key schedule
371 # this ensures that cache associativity of 2 suffices
372 &lea ("ecx",&DWP(-64-63,$key));
373 &sub ("ecx","esp");
374 &neg ("ecx");
375 &and ("ecx",0x3C0); # modulo 1024, but aligned to cache-line
376 &sub ("esp","ecx");
377 &add ("esp",4); # 4 is reserved for callee's return address
379 &shl ("eax",6);
380 &mov (&DWP(4*4,"esp"),$key); # save keyStart
381 &lea ($key,&DWP(0,$key,"eax"));
382 &mov (&DWP(5*4,"esp"),"ebx");# save %esp
384 &call (&label("pic_point"));
385 &set_label("pic_point");
386 &blindpop($Tbl);
387 &lea ($Tbl,&DWP(&label("Camellia_SBOX")."-".&label("pic_point"),$Tbl));
389 &mov (@T[0],&DWP(0,$idx)); # load ciphertext
390 &mov (@T[1],&DWP(4,$idx));
391 &mov (@T[2],&DWP(8,$idx));
392 &bswap (@T[0]);
393 &mov (@T[3],&DWP(12,$idx));
394 &bswap (@T[1]);
395 &bswap (@T[2]);
396 &bswap (@T[3]);
398 &call ("_x86_Camellia_decrypt");
400 &mov ("esp",&DWP(5*4,"esp"));
401 &bswap (@T[0]);
402 &mov ($idx,&wparam(1)); # load plaintext pointer
403 &bswap (@T[1]);
404 &bswap (@T[2]);
405 &bswap (@T[3]);
406 &mov (&DWP(0,$idx),@T[0]); # write plaintext
407 &mov (&DWP(4,$idx),@T[1]);
408 &mov (&DWP(8,$idx),@T[2]);
409 &mov (&DWP(12,$idx),@T[3]);
410 &function_end("Camellia_decrypt");
413 &function_begin_B("_x86_Camellia_decrypt");
414 &xor (@T[0],&DWP(0,$key)); # ^=key[0-3]
415 &xor (@T[1],&DWP(4,$key));
416 &xor (@T[2],&DWP(8,$key));
417 &xor (@T[3],&DWP(12,$key));
418 &mov ($idx,&DWP(-8,$key)); # prefetch key[-2]
420 &mov ($__s0,@T[0]); # save s[0-3]
421 &mov ($__s1,@T[1]);
422 &mov ($__s2,@T[2]);
423 &mov ($__s3,@T[3]);
425 &set_label("loop",16);
426 for ($i=0;$i<6;$i++) { Camellia_Feistel($i,-8,4); }
428 &sub ($key,16*4);
429 &cmp ($key,$__end);
430 &je (&label("done"));
432 # @T[0-1] are preloaded, $idx is preloaded with key[2]
433 &and ($idx,@T[0]);
434 &mov (@T[3],$__s3);
435 &rotl ($idx,1);
436 &mov (@T[2],@T[3]);
437 &xor (@T[1],$idx);
438 &or (@T[2],&DWP(4,$key));
439 &mov ($__s1,@T[1]); # s1^=LeftRotate(s0&key[0],1);
440 &xor (@T[2],$__s2);
442 &mov ($idx,&DWP(12,$key));
443 &mov ($__s2,@T[2]); # s2^=s3|key[3];
444 &or ($idx,@T[1]);
445 &and (@T[2],&DWP(0,$key));
446 &xor (@T[0],$idx);
447 &rotl (@T[2],1);
448 &mov ($__s0,@T[0]); # s0^=s1|key[1];
449 &xor (@T[3],@T[2]);
450 &mov ($idx,&DWP(-8,$key)); # prefetch key[4]
451 &mov ($__s3,@T[3]); # s3^=LeftRotate(s2&key[2],1);
452 &jmp (&label("loop"));
454 &set_label("done",8);
455 &mov (@T[2],@T[0]); # SwapHalf
456 &mov (@T[3],@T[1]);
457 &mov (@T[0],$__s2);
458 &mov (@T[1],$__s3);
459 &xor (@T[2],$idx); # $idx is preloaded with key[2]
460 &xor (@T[3],&DWP(12,$key));
461 &xor (@T[0],&DWP(0,$key));
462 &xor (@T[1],&DWP(4,$key));
463 &ret ();
464 &function_end_B("_x86_Camellia_decrypt");
466 # shld is very slow on Intel P4 family. Even on AMD it limits
467 # instruction decode rate [because it's VectorPath] and consequently
468 # performance. PIII, PM and Core[2] seem to be the only ones which
469 # execute this code ~7% faster...
470 sub __rotl128 {
471 my ($i0,$i1,$i2,$i3,$rot,$rnd,@T)=@_;
473 $rnd *= 2;
474 if ($rot) {
475 &mov ($idx,$i0);
476 &shld ($i0,$i1,$rot);
477 &shld ($i1,$i2,$rot);
478 &shld ($i2,$i3,$rot);
479 &shld ($i3,$idx,$rot);
481 &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i0 eq @T[0]);
482 &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i1 eq @T[0]);
483 &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i2 eq @T[0]);
484 &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i3 eq @T[0]);
487 # ... Implementing 128-bit rotate without shld gives >3x performance
488 # improvement on P4, only ~7% degradation on other Intel CPUs and
489 # not worse performance on AMD. This is therefore preferred.
490 sub _rotl128 {
491 my ($i0,$i1,$i2,$i3,$rot,$rnd,@T)=@_;
493 $rnd *= 2;
494 if ($rot) {
495 &mov ($Tbl,$i0);
496 &shl ($i0,$rot);
497 &mov ($idx,$i1);
498 &shr ($idx,32-$rot);
499 &shl ($i1,$rot);
500 &or ($i0,$idx);
501 &mov ($idx,$i2);
502 &shl ($i2,$rot);
503 &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i0 eq @T[0]);
504 &shr ($idx,32-$rot);
505 &or ($i1,$idx);
506 &shr ($Tbl,32-$rot);
507 &mov ($idx,$i3);
508 &shr ($idx,32-$rot);
509 &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i1 eq @T[0]);
510 &shl ($i3,$rot);
511 &or ($i2,$idx);
512 &or ($i3,$Tbl);
513 &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i2 eq @T[0]);
514 &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i3 eq @T[0]);
515 } else {
516 &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i0 eq @T[0]);
517 &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i1 eq @T[0]);
518 &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i2 eq @T[0]);
519 &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i3 eq @T[0]);
523 sub _saveround {
524 my ($rnd,$key,@T)=@_;
525 my $bias=int(@T[0])?shift(@T):0;
527 &mov (&DWP($bias+$rnd*8+0,$key),@T[0]);
528 &mov (&DWP($bias+$rnd*8+4,$key),@T[1]) if ($#T>=1);
529 &mov (&DWP($bias+$rnd*8+8,$key),@T[2]) if ($#T>=2);
530 &mov (&DWP($bias+$rnd*8+12,$key),@T[3]) if ($#T>=3);
533 sub _loadround {
534 my ($rnd,$key,@T)=@_;
535 my $bias=int(@T[0])?shift(@T):0;
537 &mov (@T[0],&DWP($bias+$rnd*8+0,$key));
538 &mov (@T[1],&DWP($bias+$rnd*8+4,$key)) if ($#T>=1);
539 &mov (@T[2],&DWP($bias+$rnd*8+8,$key)) if ($#T>=2);
540 &mov (@T[3],&DWP($bias+$rnd*8+12,$key)) if ($#T>=3);
543 # void Camellia_Ekeygen(
544 # const int keyBitLength,
545 # const Byte *rawKey,
546 # KEY_TABLE_TYPE keyTable)
547 &function_begin("Camellia_Ekeygen");
548 { my $step=0;
550 &stack_push(4); # place for s[0-3]
552 &mov ($Tbl,&wparam(0)); # load arguments
553 &mov ($idx,&wparam(1));
554 &mov ($key,&wparam(2));
556 &mov (@T[0],&DWP(0,$idx)); # load 0-127 bits
557 &mov (@T[1],&DWP(4,$idx));
558 &mov (@T[2],&DWP(8,$idx));
559 &mov (@T[3],&DWP(12,$idx));
561 &bswap (@T[0]);
562 &bswap (@T[1]);
563 &bswap (@T[2]);
564 &bswap (@T[3]);
566 &_saveround (0,$key,@T); # KL<<<0
568 &cmp ($Tbl,128);
569 &je (&label("1st128"));
571 &mov (@T[0],&DWP(16,$idx)); # load 128-191 bits
572 &mov (@T[1],&DWP(20,$idx));
573 &cmp ($Tbl,192);
574 &je (&label("1st192"));
575 &mov (@T[2],&DWP(24,$idx)); # load 192-255 bits
576 &mov (@T[3],&DWP(28,$idx));
577 &jmp (&label("1st256"));
578 &set_label("1st192",4);
579 &mov (@T[2],@T[0]);
580 &mov (@T[3],@T[1]);
581 &not (@T[2]);
582 &not (@T[3]);
583 &set_label("1st256",4);
584 &bswap (@T[0]);
585 &bswap (@T[1]);
586 &bswap (@T[2]);
587 &bswap (@T[3]);
589 &_saveround (4,$key,@T); # temporary storage for KR!
591 &xor (@T[0],&DWP(0*8+0,$key)); # KR^KL
592 &xor (@T[1],&DWP(0*8+4,$key));
593 &xor (@T[2],&DWP(1*8+0,$key));
594 &xor (@T[3],&DWP(1*8+4,$key));
596 &set_label("1st128",4);
597 &call (&label("pic_point"));
598 &set_label("pic_point");
599 &blindpop($Tbl);
600 &lea ($Tbl,&DWP(&label("Camellia_SBOX")."-".&label("pic_point"),$Tbl));
601 &lea ($key,&DWP(&label("Camellia_SIGMA")."-".&label("Camellia_SBOX"),$Tbl));
603 &mov ($idx,&DWP($step*8,$key)); # prefetch SIGMA[0]
604 &mov (&swtmp(0),@T[0]); # save s[0-3]
605 &mov (&swtmp(1),@T[1]);
606 &mov (&swtmp(2),@T[2]);
607 &mov (&swtmp(3),@T[3]);
608 &Camellia_Feistel($step++);
609 &Camellia_Feistel($step++);
610 &mov (@T[2],&swtmp(2));
611 &mov (@T[3],&swtmp(3));
613 &mov ($idx,&wparam(2));
614 &xor (@T[0],&DWP(0*8+0,$idx)); # ^KL
615 &xor (@T[1],&DWP(0*8+4,$idx));
616 &xor (@T[2],&DWP(1*8+0,$idx));
617 &xor (@T[3],&DWP(1*8+4,$idx));
619 &mov ($idx,&DWP($step*8,$key)); # prefetch SIGMA[4]
620 &mov (&swtmp(0),@T[0]); # save s[0-3]
621 &mov (&swtmp(1),@T[1]);
622 &mov (&swtmp(2),@T[2]);
623 &mov (&swtmp(3),@T[3]);
624 &Camellia_Feistel($step++);
625 &Camellia_Feistel($step++);
626 &mov (@T[2],&swtmp(2));
627 &mov (@T[3],&swtmp(3));
629 &mov ($idx,&wparam(0));
630 &cmp ($idx,128);
631 &jne (&label("2nd256"));
633 &mov ($key,&wparam(2));
634 &lea ($key,&DWP(128,$key)); # size optimization
636 ####### process KA
637 &_saveround (2,$key,-128,@T); # KA<<<0
638 &_rotl128 (@T,15,6,@T); # KA<<<15
639 &_rotl128 (@T,15,8,@T); # KA<<<(15+15=30)
640 &_rotl128 (@T,15,12,@T[0],@T[1]); # KA<<<(30+15=45)
641 &_rotl128 (@T,15,14,@T); # KA<<<(45+15=60)
642 push (@T,shift(@T)); # rotl128(@T,32);
643 &_rotl128 (@T,2,20,@T); # KA<<<(60+32+2=94)
644 &_rotl128 (@T,17,24,@T); # KA<<<(94+17=111)
646 ####### process KL
647 &_loadround (0,$key,-128,@T); # load KL
648 &_rotl128 (@T,15,4,@T); # KL<<<15
649 &_rotl128 (@T,30,10,@T); # KL<<<(15+30=45)
650 &_rotl128 (@T,15,13,@T[2],@T[3]); # KL<<<(45+15=60)
651 &_rotl128 (@T,17,16,@T); # KL<<<(60+17=77)
652 &_rotl128 (@T,17,18,@T); # KL<<<(77+17=94)
653 &_rotl128 (@T,17,22,@T); # KL<<<(94+17=111)
655 while (@T[0] ne "eax") # restore order
656 { unshift (@T,pop(@T)); }
658 &mov ("eax",3); # 3 grandRounds
659 &jmp (&label("done"));
661 &set_label("2nd256",16);
662 &mov ($idx,&wparam(2));
663 &_saveround (6,$idx,@T); # temporary storage for KA!
665 &xor (@T[0],&DWP(4*8+0,$idx)); # KA^KR
666 &xor (@T[1],&DWP(4*8+4,$idx));
667 &xor (@T[2],&DWP(5*8+0,$idx));
668 &xor (@T[3],&DWP(5*8+4,$idx));
670 &mov ($idx,&DWP($step*8,$key)); # prefetch SIGMA[8]
671 &mov (&swtmp(0),@T[0]); # save s[0-3]
672 &mov (&swtmp(1),@T[1]);
673 &mov (&swtmp(2),@T[2]);
674 &mov (&swtmp(3),@T[3]);
675 &Camellia_Feistel($step++);
676 &Camellia_Feistel($step++);
677 &mov (@T[2],&swtmp(2));
678 &mov (@T[3],&swtmp(3));
680 &mov ($key,&wparam(2));
681 &lea ($key,&DWP(128,$key)); # size optimization
683 ####### process KB
684 &_saveround (2,$key,-128,@T); # KB<<<0
685 &_rotl128 (@T,30,10,@T); # KB<<<30
686 &_rotl128 (@T,30,20,@T); # KB<<<(30+30=60)
687 push (@T,shift(@T)); # rotl128(@T,32);
688 &_rotl128 (@T,19,32,@T); # KB<<<(60+32+19=111)
690 ####### process KR
691 &_loadround (4,$key,-128,@T); # load KR
692 &_rotl128 (@T,15,4,@T); # KR<<<15
693 &_rotl128 (@T,15,8,@T); # KR<<<(15+15=30)
694 &_rotl128 (@T,30,18,@T); # KR<<<(30+30=60)
695 push (@T,shift(@T)); # rotl128(@T,32);
696 &_rotl128 (@T,2,26,@T); # KR<<<(60+32+2=94)
698 ####### process KA
699 &_loadround (6,$key,-128,@T); # load KA
700 &_rotl128 (@T,15,6,@T); # KA<<<15
701 &_rotl128 (@T,30,14,@T); # KA<<<(15+30=45)
702 push (@T,shift(@T)); # rotl128(@T,32);
703 &_rotl128 (@T,0,24,@T); # KA<<<(45+32+0=77)
704 &_rotl128 (@T,17,28,@T); # KA<<<(77+17=94)
706 ####### process KL
707 &_loadround (0,$key,-128,@T); # load KL
708 push (@T,shift(@T)); # rotl128(@T,32);
709 &_rotl128 (@T,13,12,@T); # KL<<<(32+13=45)
710 &_rotl128 (@T,15,16,@T); # KL<<<(45+15=60)
711 &_rotl128 (@T,17,22,@T); # KL<<<(60+17=77)
712 push (@T,shift(@T)); # rotl128(@T,32);
713 &_rotl128 (@T,2,30,@T); # KL<<<(77+32+2=111)
715 while (@T[0] ne "eax") # restore order
716 { unshift (@T,pop(@T)); }
718 &mov ("eax",4); # 4 grandRounds
719 &set_label("done");
720 &lea ("edx",&DWP(272-128,$key)); # end of key schedule
721 &stack_pop(4);
723 &function_end("Camellia_Ekeygen");
725 if ($OPENSSL) {
726 # int private_Camellia_set_key (
727 # const unsigned char *userKey,
728 # int bits,
729 # CAMELLIA_KEY *key)
730 &function_begin_B("private_Camellia_set_key");
731 &push ("ebx");
732 &mov ("ecx",&wparam(0)); # pull arguments
733 &mov ("ebx",&wparam(1));
734 &mov ("edx",&wparam(2));
736 &mov ("eax",-1);
737 &test ("ecx","ecx");
738 &jz (&label("done")); # userKey==NULL?
739 &test ("edx","edx");
740 &jz (&label("done")); # key==NULL?
742 &mov ("eax",-2);
743 &cmp ("ebx",256);
744 &je (&label("arg_ok")); # bits==256?
745 &cmp ("ebx",192);
746 &je (&label("arg_ok")); # bits==192?
747 &cmp ("ebx",128);
748 &jne (&label("done")); # bits!=128?
749 &set_label("arg_ok",4);
751 &push ("edx"); # push arguments
752 &push ("ecx");
753 &push ("ebx");
754 &call ("Camellia_Ekeygen");
755 &stack_pop(3);
757 # eax holds grandRounds and edx points at where to put it
758 &mov (&DWP(0,"edx"),"eax");
759 &xor ("eax","eax");
760 &set_label("done",4);
761 &pop ("ebx");
762 &ret ();
763 &function_end_B("private_Camellia_set_key");
766 @SBOX=(
767 112,130, 44,236,179, 39,192,229,228,133, 87, 53,234, 12,174, 65,
768 35,239,107,147, 69, 25,165, 33,237, 14, 79, 78, 29,101,146,189,
769 134,184,175,143,124,235, 31,206, 62, 48,220, 95, 94,197, 11, 26,
770 166,225, 57,202,213, 71, 93, 61,217, 1, 90,214, 81, 86,108, 77,
771 139, 13,154,102,251,204,176, 45,116, 18, 43, 32,240,177,132,153,
772 223, 76,203,194, 52,126,118, 5,109,183,169, 49,209, 23, 4,215,
773 20, 88, 58, 97,222, 27, 17, 28, 50, 15,156, 22, 83, 24,242, 34,
774 254, 68,207,178,195,181,122,145, 36, 8,232,168, 96,252,105, 80,
775 170,208,160,125,161,137, 98,151, 84, 91, 30,149,224,255,100,210,
776 16,196, 0, 72,163,247,117,219,138, 3,230,218, 9, 63,221,148,
777 135, 92,131, 2,205, 74,144, 51,115,103,246,243,157,127,191,226,
778 82,155,216, 38,200, 55,198, 59,129,150,111, 75, 19,190, 99, 46,
779 233,121,167,140,159,110,188,142, 41,245,249,182, 47,253,180, 89,
780 120,152, 6,106,231, 70,113,186,212, 37,171, 66,136,162,141,250,
781 114, 7,185, 85,248,238,172, 10, 54, 73, 42,104, 60, 56,241,164,
782 64, 40,211,123,187,201, 67,193, 21,227,173,244,119,199,128,158);
784 sub S1110 { my $i=shift; $i=@SBOX[$i]; return $i<<24|$i<<16|$i<<8; }
785 sub S4404 { my $i=shift; $i=($i<<1|$i>>7)&0xff; $i=@SBOX[$i]; return $i<<24|$i<<16|$i; }
786 sub S0222 { my $i=shift; $i=@SBOX[$i]; $i=($i<<1|$i>>7)&0xff; return $i<<16|$i<<8|$i; }
787 sub S3033 { my $i=shift; $i=@SBOX[$i]; $i=($i>>1|$i<<7)&0xff; return $i<<24|$i<<8|$i; }
789 &set_label("Camellia_SIGMA",64);
790 &data_word(
791 0xa09e667f, 0x3bcc908b, 0xb67ae858, 0x4caa73b2,
792 0xc6ef372f, 0xe94f82be, 0x54ff53a5, 0xf1d36f1c,
793 0x10e527fa, 0xde682d1d, 0xb05688c2, 0xb3e6c1fd,
794 0, 0, 0, 0);
795 &set_label("Camellia_SBOX",64);
796 # tables are interleaved, remember?
797 for ($i=0;$i<256;$i++) { &data_word(&S1110($i),&S4404($i)); }
798 for ($i=0;$i<256;$i++) { &data_word(&S0222($i),&S3033($i)); }
800 # void Camellia_cbc_encrypt (const void char *inp, unsigned char *out,
801 # size_t length, const CAMELLIA_KEY *key,
802 # unsigned char *ivp,const int enc);
804 # stack frame layout
805 # -4(%esp) # return address 0(%esp)
806 # 0(%esp) # s0 4(%esp)
807 # 4(%esp) # s1 8(%esp)
808 # 8(%esp) # s2 12(%esp)
809 # 12(%esp) # s3 16(%esp)
810 # 16(%esp) # end of key schedule 20(%esp)
811 # 20(%esp) # %esp backup
812 my $_inp=&DWP(24,"esp"); #copy of wparam(0)
813 my $_out=&DWP(28,"esp"); #copy of wparam(1)
814 my $_len=&DWP(32,"esp"); #copy of wparam(2)
815 my $_key=&DWP(36,"esp"); #copy of wparam(3)
816 my $_ivp=&DWP(40,"esp"); #copy of wparam(4)
817 my $ivec=&DWP(44,"esp"); #ivec[16]
818 my $_tmp=&DWP(44,"esp"); #volatile variable [yes, aliases with ivec]
819 my ($s0,$s1,$s2,$s3) = @T;
821 &function_begin("Camellia_cbc_encrypt");
822 &mov ($s2 eq "ecx"? $s2 : "",&wparam(2)); # load len
823 &cmp ($s2,0);
824 &je (&label("enc_out"));
826 &pushf ();
827 &cld ();
829 &mov ($s0,&wparam(0)); # load inp
830 &mov ($s1,&wparam(1)); # load out
831 #&mov ($s2,&wparam(2)); # load len
832 &mov ($s3,&wparam(3)); # load key
833 &mov ($Tbl,&wparam(4)); # load ivp
835 # allocate aligned stack frame...
836 &lea ($idx,&DWP(-64,"esp"));
837 &and ($idx,-64);
839 # place stack frame just "above mod 1024" the key schedule
840 # this ensures that cache associativity of 2 suffices
841 &lea ($key,&DWP(-64-63,$s3));
842 &sub ($key,$idx);
843 &neg ($key);
844 &and ($key,0x3C0); # modulo 1024, but aligned to cache-line
845 &sub ($idx,$key);
847 &mov ($key,&wparam(5)); # load enc
849 &exch ("esp",$idx);
850 &add ("esp",4); # reserve for return address!
851 &mov ($_esp,$idx); # save %esp
853 &mov ($_inp,$s0); # save copy of inp
854 &mov ($_out,$s1); # save copy of out
855 &mov ($_len,$s2); # save copy of len
856 &mov ($_key,$s3); # save copy of key
857 &mov ($_ivp,$Tbl); # save copy of ivp
859 &call (&label("pic_point")); # make it PIC!
860 &set_label("pic_point");
861 &blindpop($Tbl);
862 &lea ($Tbl,&DWP(&label("Camellia_SBOX")."-".&label("pic_point"),$Tbl));
864 &mov ($idx,32);
865 &set_label("prefetch_sbox",4);
866 &mov ($s0,&DWP(0,$Tbl));
867 &mov ($s1,&DWP(32,$Tbl));
868 &mov ($s2,&DWP(64,$Tbl));
869 &mov ($s3,&DWP(96,$Tbl));
870 &lea ($Tbl,&DWP(128,$Tbl));
871 &dec ($idx);
872 &jnz (&label("prefetch_sbox"));
873 &mov ($s0,$_key);
874 &sub ($Tbl,4096);
875 &mov ($idx,$_inp);
876 &mov ($s3,&DWP(272,$s0)); # load grandRounds
878 &cmp ($key,0);
879 &je (&label("DECRYPT"));
881 &mov ($s2,$_len);
882 &mov ($key,$_ivp);
883 &shl ($s3,6);
884 &lea ($s3,&DWP(0,$s0,$s3));
885 &mov ($_end,$s3);
887 &test ($s2,0xFFFFFFF0);
888 &jz (&label("enc_tail")); # short input...
890 &mov ($s0,&DWP(0,$key)); # load iv
891 &mov ($s1,&DWP(4,$key));
893 &set_label("enc_loop",4);
894 &mov ($s2,&DWP(8,$key));
895 &mov ($s3,&DWP(12,$key));
897 &xor ($s0,&DWP(0,$idx)); # xor input data
898 &xor ($s1,&DWP(4,$idx));
899 &xor ($s2,&DWP(8,$idx));
900 &bswap ($s0);
901 &xor ($s3,&DWP(12,$idx));
902 &bswap ($s1);
903 &mov ($key,$_key); # load key
904 &bswap ($s2);
905 &bswap ($s3);
907 &call ("_x86_Camellia_encrypt");
909 &mov ($idx,$_inp); # load inp
910 &mov ($key,$_out); # load out
912 &bswap ($s0);
913 &bswap ($s1);
914 &bswap ($s2);
915 &mov (&DWP(0,$key),$s0); # save output data
916 &bswap ($s3);
917 &mov (&DWP(4,$key),$s1);
918 &mov (&DWP(8,$key),$s2);
919 &mov (&DWP(12,$key),$s3);
921 &mov ($s2,$_len); # load len
923 &lea ($idx,&DWP(16,$idx));
924 &mov ($_inp,$idx); # save inp
926 &lea ($s3,&DWP(16,$key));
927 &mov ($_out,$s3); # save out
929 &sub ($s2,16);
930 &test ($s2,0xFFFFFFF0);
931 &mov ($_len,$s2); # save len
932 &jnz (&label("enc_loop"));
933 &test ($s2,15);
934 &jnz (&label("enc_tail"));
935 &mov ($idx,$_ivp); # load ivp
936 &mov ($s2,&DWP(8,$key)); # restore last dwords
937 &mov ($s3,&DWP(12,$key));
938 &mov (&DWP(0,$idx),$s0); # save ivec
939 &mov (&DWP(4,$idx),$s1);
940 &mov (&DWP(8,$idx),$s2);
941 &mov (&DWP(12,$idx),$s3);
943 &mov ("esp",$_esp);
944 &popf ();
945 &set_label("enc_out");
946 &function_end_A();
947 &pushf (); # kludge, never executed
949 &set_label("enc_tail",4);
950 &mov ($s0,$key eq "edi" ? $key : "");
951 &mov ($key,$_out); # load out
952 &push ($s0); # push ivp
953 &mov ($s1,16);
954 &sub ($s1,$s2);
955 &cmp ($key,$idx); # compare with inp
956 &je (&label("enc_in_place"));
957 &align (4);
958 &data_word(0xA4F3F689); # rep movsb # copy input
959 &jmp (&label("enc_skip_in_place"));
960 &set_label("enc_in_place");
961 &lea ($key,&DWP(0,$key,$s2));
962 &set_label("enc_skip_in_place");
963 &mov ($s2,$s1);
964 &xor ($s0,$s0);
965 &align (4);
966 &data_word(0xAAF3F689); # rep stosb # zero tail
967 &pop ($key); # pop ivp
969 &mov ($idx,$_out); # output as input
970 &mov ($s0,&DWP(0,$key));
971 &mov ($s1,&DWP(4,$key));
972 &mov ($_len,16); # len=16
973 &jmp (&label("enc_loop")); # one more spin...
975 #----------------------------- DECRYPT -----------------------------#
976 &set_label("DECRYPT",16);
977 &shl ($s3,6);
978 &lea ($s3,&DWP(0,$s0,$s3));
979 &mov ($_end,$s0);
980 &mov ($_key,$s3);
982 &cmp ($idx,$_out);
983 &je (&label("dec_in_place")); # in-place processing...
985 &mov ($key,$_ivp); # load ivp
986 &mov ($_tmp,$key);
988 &set_label("dec_loop",4);
989 &mov ($s0,&DWP(0,$idx)); # read input
990 &mov ($s1,&DWP(4,$idx));
991 &mov ($s2,&DWP(8,$idx));
992 &bswap ($s0);
993 &mov ($s3,&DWP(12,$idx));
994 &bswap ($s1);
995 &mov ($key,$_key); # load key
996 &bswap ($s2);
997 &bswap ($s3);
999 &call ("_x86_Camellia_decrypt");
1001 &mov ($key,$_tmp); # load ivp
1002 &mov ($idx,$_len); # load len
1004 &bswap ($s0);
1005 &bswap ($s1);
1006 &bswap ($s2);
1007 &xor ($s0,&DWP(0,$key)); # xor iv
1008 &bswap ($s3);
1009 &xor ($s1,&DWP(4,$key));
1010 &xor ($s2,&DWP(8,$key));
1011 &xor ($s3,&DWP(12,$key));
1013 &sub ($idx,16);
1014 &jc (&label("dec_partial"));
1015 &mov ($_len,$idx); # save len
1016 &mov ($idx,$_inp); # load inp
1017 &mov ($key,$_out); # load out
1019 &mov (&DWP(0,$key),$s0); # write output
1020 &mov (&DWP(4,$key),$s1);
1021 &mov (&DWP(8,$key),$s2);
1022 &mov (&DWP(12,$key),$s3);
1024 &mov ($_tmp,$idx); # save ivp
1025 &lea ($idx,&DWP(16,$idx));
1026 &mov ($_inp,$idx); # save inp
1028 &lea ($key,&DWP(16,$key));
1029 &mov ($_out,$key); # save out
1031 &jnz (&label("dec_loop"));
1032 &mov ($key,$_tmp); # load temp ivp
1033 &set_label("dec_end");
1034 &mov ($idx,$_ivp); # load user ivp
1035 &mov ($s0,&DWP(0,$key)); # load iv
1036 &mov ($s1,&DWP(4,$key));
1037 &mov ($s2,&DWP(8,$key));
1038 &mov ($s3,&DWP(12,$key));
1039 &mov (&DWP(0,$idx),$s0); # copy back to user
1040 &mov (&DWP(4,$idx),$s1);
1041 &mov (&DWP(8,$idx),$s2);
1042 &mov (&DWP(12,$idx),$s3);
1043 &jmp (&label("dec_out"));
1045 &set_label("dec_partial",4);
1046 &lea ($key,$ivec);
1047 &mov (&DWP(0,$key),$s0); # dump output to stack
1048 &mov (&DWP(4,$key),$s1);
1049 &mov (&DWP(8,$key),$s2);
1050 &mov (&DWP(12,$key),$s3);
1051 &lea ($s2 eq "ecx" ? $s2 : "",&DWP(16,$idx));
1052 &mov ($idx eq "esi" ? $idx : "",$key);
1053 &mov ($key eq "edi" ? $key : "",$_out); # load out
1054 &data_word(0xA4F3F689); # rep movsb # copy output
1055 &mov ($key,$_inp); # use inp as temp ivp
1056 &jmp (&label("dec_end"));
1058 &set_label("dec_in_place",4);
1059 &set_label("dec_in_place_loop");
1060 &lea ($key,$ivec);
1061 &mov ($s0,&DWP(0,$idx)); # read input
1062 &mov ($s1,&DWP(4,$idx));
1063 &mov ($s2,&DWP(8,$idx));
1064 &mov ($s3,&DWP(12,$idx));
1066 &mov (&DWP(0,$key),$s0); # copy to temp
1067 &mov (&DWP(4,$key),$s1);
1068 &mov (&DWP(8,$key),$s2);
1069 &bswap ($s0);
1070 &mov (&DWP(12,$key),$s3);
1071 &bswap ($s1);
1072 &mov ($key,$_key); # load key
1073 &bswap ($s2);
1074 &bswap ($s3);
1076 &call ("_x86_Camellia_decrypt");
1078 &mov ($key,$_ivp); # load ivp
1079 &mov ($idx,$_out); # load out
1081 &bswap ($s0);
1082 &bswap ($s1);
1083 &bswap ($s2);
1084 &xor ($s0,&DWP(0,$key)); # xor iv
1085 &bswap ($s3);
1086 &xor ($s1,&DWP(4,$key));
1087 &xor ($s2,&DWP(8,$key));
1088 &xor ($s3,&DWP(12,$key));
1090 &mov (&DWP(0,$idx),$s0); # write output
1091 &mov (&DWP(4,$idx),$s1);
1092 &mov (&DWP(8,$idx),$s2);
1093 &mov (&DWP(12,$idx),$s3);
1095 &lea ($idx,&DWP(16,$idx));
1096 &mov ($_out,$idx); # save out
1098 &lea ($idx,$ivec);
1099 &mov ($s0,&DWP(0,$idx)); # read temp
1100 &mov ($s1,&DWP(4,$idx));
1101 &mov ($s2,&DWP(8,$idx));
1102 &mov ($s3,&DWP(12,$idx));
1104 &mov (&DWP(0,$key),$s0); # copy iv
1105 &mov (&DWP(4,$key),$s1);
1106 &mov (&DWP(8,$key),$s2);
1107 &mov (&DWP(12,$key),$s3);
1109 &mov ($idx,$_inp); # load inp
1111 &lea ($idx,&DWP(16,$idx));
1112 &mov ($_inp,$idx); # save inp
1114 &mov ($s2,$_len); # load len
1115 &sub ($s2,16);
1116 &jc (&label("dec_in_place_partial"));
1117 &mov ($_len,$s2); # save len
1118 &jnz (&label("dec_in_place_loop"));
1119 &jmp (&label("dec_out"));
1121 &set_label("dec_in_place_partial",4);
1122 # one can argue if this is actually required...
1123 &mov ($key eq "edi" ? $key : "",$_out);
1124 &lea ($idx eq "esi" ? $idx : "",$ivec);
1125 &lea ($key,&DWP(0,$key,$s2));
1126 &lea ($idx,&DWP(16,$idx,$s2));
1127 &neg ($s2 eq "ecx" ? $s2 : "");
1128 &data_word(0xA4F3F689); # rep movsb # restore tail
1130 &set_label("dec_out",4);
1131 &mov ("esp",$_esp);
1132 &popf ();
1133 &function_end("Camellia_cbc_encrypt");
1136 &asciz("Camellia for x86 by <appro\@openssl.org>");
1138 &asm_finish();