#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
#
# Hardware SPARC T4 support by David S. Miller <davem@davemloft.net>.
# ====================================================================

# SHA256 performance improvement over compiler generated code varies
# from 40% for Sun C [32-bit build] to 70% for gcc [3.3, 64-bit
# build]. Just like in the SHA1 module I aim to ensure scalability on
# UltraSPARC T1 by packing X[16] into 8 64-bit registers.

# SHA512 on pre-T1 UltraSPARC.
#
# Performance is >75% better than 64-bit code generated by Sun C and
# over 2x better than 32-bit code. X[16] resides on stack, but access
# to it is scheduled for L2 latency and staged through the 32 least
# significant bits of %l0-%l7. The latter is done to achieve 32-/64-bit
# ABI duality. Nevertheless it's ~40% faster than SHA256, which is
# pretty good [the optimal coefficient is 50%].

# SHA512 on UltraSPARC T1.
#
# It's not any faster than 64-bit code generated by Sun C 5.8. This is
# because the 64-bit code generator has the advantage of using 64-bit
# loads(*) to access X[16], which I consciously traded for 32-/64-bit
# ABI duality [as per above]. But it surpasses 32-bit Sun C generated
# code by 60%, not to mention that it doesn't suffer from severe decay
# when running four threads per physical core, and that it leaves gcc
# [3.4] behind by over a 4x factor! Compared to SHA256, single-thread
# performance is only 10% better, but overall throughput at the maximum
# thread count for a given CPU exceeds that of SHA256 by 30% [again,
# the optimal coefficient is 50%].
#
# (*) Unlike on pre-T1 UltraSPARC, loads on T1 are executed strictly
#     in-order, i.e. a load instruction has to complete before the next
#     instruction in the given thread is executed, even if the latter
#     is not dependent on the load result! This means that on T1 two
#     32-bit loads are always slower than one 64-bit load. Once again
#     this is unlike pre-T1 UltraSPARC, where, if scheduled
#     appropriately, 2x32-bit loads can be as fast as 1x64-bit ones.

# SPARC T4 SHA256/512 hardware achieves 3.17/2.01 cycles per byte,
# which is 9.3x/11.1x faster than software. Multi-process benchmark
# saturates at 11.5x single-process result on 8-core processor, or
# ~11/16GBps per 2.85GHz socket.
$output=shift;
open STDOUT,">$output";
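# Usage sketch [a by-hand invocation; the build system normally drives
# this, and the output file names here are illustrative]:
#
#	perl sha512-sparcv9.pl sha512-sparcv9.S		# SHA512 flavour
#	perl sha512-sparcv9.pl sha256-sparcv9.S		# SHA256 flavour
#
# The only thing examined is whether the output file name matches /512/,
# as the branch below shows.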
if ($output =~ /512/) {
	$label="512";
	$SZ=8;
	$LD="ldx";		# load from memory
	$ST="stx";		# store to memory
	$SLL="sllx";		# shift left logical
	$SRL="srlx";		# shift right logical
	@Sigma0=(28,34,39);
	@Sigma1=(14,18,41);
	@sigma0=( 7, 1, 8);	# right shift first
	@sigma1=( 6,19,61);	# right shift first
	$lastK=0x817;
	$rounds=80;
	$align=4;

	$locals=16*$SZ;		# X[16]

	$A="%o0";
	$B="%o1";
	$C="%o2";
	$D="%o3";
	$E="%o4";
	$F="%o5";
	$G="%g1";
	$H="%o7";
	@V=($A,$B,$C,$D,$E,$F,$G,$H);
} else {
	$label="256";
	$SZ=4;
	$LD="ld";		# load from memory
	$ST="st";		# store to memory
	$SLL="sll";		# shift left logical
	$SRL="srl";		# shift right logical
	@Sigma0=( 2,13,22);
	@Sigma1=( 6,11,25);
	@sigma0=( 3, 7,18);	# right shift first
	@sigma1=(10,17,19);	# right shift first
	$lastK=0x8f2;
	$rounds=64;
	$align=8;

	$locals=0;		# X[16] is register resident
	@X=("%o0","%o1","%o2","%o3","%o4","%o5","%g1","%o7");

	$A="%l0";
	$B="%l1";
	$C="%l2";
	$D="%l3";
	$E="%l4";
	$F="%l5";
	$G="%l6";
	$H="%l7";
	@V=($A,$B,$C,$D,$E,$F,$G,$H);
}
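# A note on how the shift tables above are consumed: SPARCv9 has no
# rotate instruction, so BODY_00_15 below synthesizes each rotation of
# a w-bit word (w=$SZ*8) from a shift pair,
#
#	ROTR(x,r) = ($SRL x,r) | ($SLL x,w-r)
#
# e.g. for SHA512 @Sigma1=(14,18,41) expands to
# Sigma1(e) = ROTR(e,14) ^ ROTR(e,18) ^ ROTR(e,41), per FIPS 180-4.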
$T1="%g2";
$tmp0="%g3";
$tmp1="%g4";
$tmp2="%g5";

$ctx="%i0";
$inp="%i1";
$len="%i2";
$Ktbl="%i3";
$tmp31="%i4";
$tmp32="%i5";
########### SHA256
$Xload = sub {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;

    if ($i==0) {
$code.=<<___;
	ldx	[$inp+0],@X[0]
	ldx	[$inp+16],@X[2]
	ldx	[$inp+32],@X[4]
	ldx	[$inp+48],@X[6]
	ldx	[$inp+8],@X[1]
	ldx	[$inp+24],@X[3]
	subcc	%g0,$tmp31,$tmp32 ! should be 64-$tmp31, but -$tmp31 works too
	ldx	[$inp+40],@X[5]
	bz,pt	%icc,.Laligned
	ldx	[$inp+56],@X[7]

	sllx	@X[0],$tmp31,@X[0]
	ldx	[$inp+64],$T1
___
	for($j=0;$j<7;$j++)
	{   $code.=<<___;
	srlx	@X[$j+1],$tmp32,$tmp1
	sllx	@X[$j+1],$tmp31,@X[$j+1]
	or	$tmp1,@X[$j],@X[$j]
___
	}
	$code.=<<___;
	srlx	$T1,$tmp32,$T1
	or	$T1,@X[7],@X[7]
.Laligned:
___
    }

    if ($i&1) {
	$code.="\tadd\t@X[$i/2],$h,$T1\n";
    } else {
	$code.="\tsrlx\t@X[$i/2],32,$T1\n\tadd\t$h,$T1,$T1\n";
    }
} if ($SZ==4);
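# Packing sketch: the big-endian 64-bit ldx-s above leave message word
# X[2k] in bits 63:32 of @X[k] and X[2k+1] in bits 31:0, which is why
# even rounds extract their word with "srlx @X[i/2],32" while odd
# rounds can use @X[i/2] as-is [the 32-bit adds only observe the low
# half].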
########### SHA512
$Xload = sub {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
my @pair=("%l".eval(($i*2)%8),"%l".eval(($i*2)%8+1),"%l".eval((($i+1)*2)%8));

$code.=<<___ if ($i==0);
	ld	[$inp+0],%l0
	ld	[$inp+4],%l1
	ld	[$inp+8],%l2
	ld	[$inp+12],%l3
	ld	[$inp+16],%l4
	ld	[$inp+20],%l5
	ld	[$inp+24],%l6
	cmp	$tmp31,0
	ld	[$inp+28],%l7
___
$code.=<<___ if ($i<15);
	sllx	@pair[1],$tmp31,$tmp2	! Xload($i)
	add	$tmp31,32,$tmp0
	sllx	@pair[0],$tmp0,$tmp1
	`"ld	[$inp+".eval(32+0+$i*8)."],@pair[0]"	if ($i<12)`
	srlx	@pair[2],$tmp32,@pair[1]
	or	$tmp1,$tmp2,$tmp2
	or	@pair[1],$tmp2,$tmp2
	`"ld	[$inp+".eval(32+4+$i*8)."],@pair[1]"	if ($i<12)`
	add	$h,$tmp2,$T1
	$ST	$tmp2,[%sp+STACK_BIAS+STACK_FRAME+`$i*$SZ`]
___
$code.=<<___ if ($i==12);
	bnz,a,pn	%icc,.+8
	ld	[$inp+128],%l0
___
$code.=<<___ if ($i==15);
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+1)%16)*$SZ+0`],%l2
	sllx	@pair[1],$tmp31,$tmp2	! Xload($i)
	add	$tmp31,32,$tmp0
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+1)%16)*$SZ+4`],%l3
	sllx	@pair[0],$tmp0,$tmp1
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+9)%16)*$SZ+0`],%l4
	srlx	@pair[2],$tmp32,@pair[1]
	or	$tmp1,$tmp2,$tmp2
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+9)%16)*$SZ+4`],%l5
	or	@pair[1],$tmp2,$tmp2
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+14)%16)*$SZ+0`],%l6
	add	$h,$tmp2,$T1
	$ST	$tmp2,[%sp+STACK_BIAS+STACK_FRAME+`$i*$SZ`]
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+14)%16)*$SZ+4`],%l7
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+0)%16)*$SZ+0`],%l0
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+0)%16)*$SZ+4`],%l1
___
} if ($SZ==8);
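# What the Xload above computes, for a misalignment of t bits
# [$tmp31=8*(inp&7), $tmp32=32-$tmp31; a sketch derived from the shift
# pattern]: with (hi,lo) being the staged 32-bit halves in @pair[0,1]
# and "next" the following word in @pair[2],
#
#	X[i] = (hi << (32+t)) | (lo << t) | (next >> (32-t))
#
# which for aligned input (t=0) degenerates to (hi<<32)|lo, since ld
# zero-extends. The result is both fed to the round ($T1) and spilled
# to the stack-resident X[16] window.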
########### common
sub BODY_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;

    if ($i<16) {
	&$Xload(@_);
    } else {
	$code.="\tadd\t$h,$T1,$T1\n";
    }

$code.=<<___;
	$SRL	$e,@Sigma1[0],$h	!! $i
	xor	$f,$g,$tmp2
	$SLL	$e,`$SZ*8-@Sigma1[2]`,$tmp1
	and	$e,$tmp2,$tmp2
	$SRL	$e,@Sigma1[1],$tmp0
	xor	$tmp1,$h,$h
	$SLL	$e,`$SZ*8-@Sigma1[1]`,$tmp1
	xor	$tmp0,$h,$h
	$SRL	$e,@Sigma1[2],$tmp0
	xor	$tmp1,$h,$h
	$SLL	$e,`$SZ*8-@Sigma1[0]`,$tmp1
	xor	$tmp0,$h,$h
	xor	$g,$tmp2,$tmp2		! Ch(e,f,g)
	xor	$tmp1,$h,$tmp0		! Sigma1(e)

	$SRL	$a,@Sigma0[0],$h
	add	$tmp2,$T1,$T1
	$LD	[$Ktbl+`$i*$SZ`],$tmp2	! K[$i]
	$SLL	$a,`$SZ*8-@Sigma0[2]`,$tmp1
	add	$tmp0,$T1,$T1
	$SRL	$a,@Sigma0[1],$tmp0
	xor	$tmp1,$h,$h
	$SLL	$a,`$SZ*8-@Sigma0[1]`,$tmp1
	xor	$tmp0,$h,$h
	$SRL	$a,@Sigma0[2],$tmp0
	xor	$tmp1,$h,$h
	$SLL	$a,`$SZ*8-@Sigma0[0]`,$tmp1
	xor	$tmp0,$h,$h
	xor	$tmp1,$h,$h		! Sigma0(a)

	or	$a,$b,$tmp0
	and	$a,$b,$tmp1
	and	$c,$tmp0,$tmp0
	or	$tmp0,$tmp1,$tmp1	! Maj(a,b,c)
	add	$tmp2,$T1,$T1		! +=K[$i]
	add	$tmp1,$h,$h

	add	$T1,$d,$d
	add	$T1,$h,$h
___
}
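# Net effect of the round body above, in FIPS 180-4 notation:
#
#	T1 = h + Sigma1(e) + Ch(e,f,g) + K[i] + X[i]
#	d += T1;   h = T1 + Sigma0(a) + Maj(a,b,c)
#
# with Ch(e,f,g) computed in the equivalent form ((f^g)&e)^g and
# Maj(a,b,c) in the equivalent 4-instruction form ((a|b)&c)|(a&b).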
########### SHA256
$BODY_16_XX = sub {
my $i=@_[0];
my $xi;

    if ($i&1) {
	$xi=$tmp32;
	$code.="\tsrlx\t@X[(($i+1)/2)%8],32,$xi\n";
    } else {
	$xi=@X[(($i+1)/2)%8];
    }
$code.=<<___;
	srl	$xi,@sigma0[0],$T1		!! Xupdate($i)
	sll	$xi,`32-@sigma0[2]`,$tmp1
	srl	$xi,@sigma0[1],$tmp0
	xor	$tmp1,$T1,$T1
	sll	$tmp1,`@sigma0[2]-@sigma0[1]`,$tmp1
	xor	$tmp0,$T1,$T1
	srl	$xi,@sigma0[2],$tmp0
	xor	$tmp1,$T1,$T1
___
    if ($i&1) {
	$xi=@X[(($i+14)/2)%8];
    } else {
	$xi=$tmp32;
	$code.="\tsrlx\t@X[(($i+14)/2)%8],32,$xi\n";
    }
$code.=<<___;
	srl	$xi,@sigma1[0],$tmp2
	xor	$tmp0,$T1,$T1			! T1=sigma0(X[i+1])
	sll	$xi,`32-@sigma1[2]`,$tmp1
	srl	$xi,@sigma1[1],$tmp0
	xor	$tmp1,$tmp2,$tmp2
	sll	$tmp1,`@sigma1[2]-@sigma1[1]`,$tmp1
	xor	$tmp0,$tmp2,$tmp2
	srl	$xi,@sigma1[2],$tmp0
	xor	$tmp1,$tmp2,$tmp2
___
    if ($i&1) {
	$xi=@X[($i/2)%8];
$code.=<<___;
	srlx	@X[(($i+9)/2)%8],32,$tmp1	! X[i+9]
	xor	$tmp0,$tmp2,$tmp2		! sigma1(X[i+14])
	srl	@X[($i/2)%8],0,$tmp0
	add	$tmp2,$tmp1,$tmp1
	add	$xi,$T1,$T1			! +=X[i]
	xor	$tmp0,@X[($i/2)%8],@X[($i/2)%8]
	add	$tmp1,$T1,$T1

	srl	$T1,0,$T1
	or	$T1,@X[($i/2)%8],@X[($i/2)%8]
___
    } else {
	$xi=@X[(($i+9)/2)%8];
$code.=<<___;
	srlx	@X[($i/2)%8],32,$tmp1		! X[i]
	xor	$tmp0,$tmp2,$tmp2		! sigma1(X[i+14])
	add	$xi,$T1,$T1			! +=X[i+9]
	add	$tmp2,$tmp1,$tmp1
	srl	@X[($i/2)%8],0,@X[($i/2)%8]
	add	$tmp1,$T1,$T1

	sllx	$T1,32,$tmp0
	or	$tmp0,@X[($i/2)%8],@X[($i/2)%8]
___
    }
    &BODY_00_15(@_);
} if ($SZ==4);
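# The schedule computed above is the standard SHA256 recurrence
#
#	X[i+16] = sigma1(X[i+14]) + X[i+9] + sigma0(X[i+1]) + X[i]
#
# expressed relative to the 16-word window; the even/odd cases differ
# only in which half of the packed 64-bit @X[] registers each operand
# occupies, and the result is merged back into the proper half.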
########### SHA512
$BODY_16_XX = sub {
my $i=@_[0];
my @pair=("%l".eval(($i*2)%8),"%l".eval(($i*2)%8+1));

$code.=<<___;
	sllx	%l2,32,$tmp0		!! Xupdate($i)
	or	%l3,$tmp0,$tmp0

	srlx	$tmp0,@sigma0[0],$T1
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+1)%16)*$SZ+0`],%l2
	sllx	$tmp0,`64-@sigma0[2]`,$tmp1
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+1)%16)*$SZ+4`],%l3
	srlx	$tmp0,@sigma0[1],$tmp0
	xor	$tmp1,$T1,$T1
	sllx	$tmp1,`@sigma0[2]-@sigma0[1]`,$tmp1
	xor	$tmp0,$T1,$T1
	srlx	$tmp0,`@sigma0[2]-@sigma0[1]`,$tmp0
	xor	$tmp1,$T1,$T1
	sllx	%l6,32,$tmp2
	xor	$tmp0,$T1,$T1		! sigma0(X[$i+1])
	or	%l7,$tmp2,$tmp2

	srlx	$tmp2,@sigma1[0],$tmp1
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+14)%16)*$SZ+0`],%l6
	sllx	$tmp2,`64-@sigma1[2]`,$tmp0
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+14)%16)*$SZ+4`],%l7
	srlx	$tmp2,@sigma1[1],$tmp2
	xor	$tmp0,$tmp1,$tmp1
	sllx	$tmp0,`@sigma1[2]-@sigma1[1]`,$tmp0
	xor	$tmp2,$tmp1,$tmp1
	srlx	$tmp2,`@sigma1[2]-@sigma1[1]`,$tmp2
	xor	$tmp0,$tmp1,$tmp1
	sllx	%l4,32,$tmp0
	xor	$tmp2,$tmp1,$tmp1	! sigma1(X[$i+14])
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+9)%16)*$SZ+0`],%l4
	or	%l5,$tmp0,$tmp0
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+9)%16)*$SZ+4`],%l5

	sllx	%l0,32,$tmp2
	add	$tmp1,$T1,$T1
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+0)%16)*$SZ+0`],%l0
	or	%l1,$tmp2,$tmp2
	add	$tmp0,$T1,$T1		! +=X[$i+9]
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+0)%16)*$SZ+4`],%l1
	add	$tmp2,$T1,$T1		! +=X[$i]
	$ST	$T1,[%sp+STACK_BIAS+STACK_FRAME+`($i%16)*$SZ`]
___
	&BODY_00_15(@_);
} if ($SZ==8);
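# The SHA512 flavour implements the same recurrence on 64-bit words,
# but since X[16] is stack resident [for 32-/64-bit ABI duality], each
# operand is first re-assembled from two 32-bit halves staged in
# %l0-%l7, and the interleaved ld-s refill those staging registers for
# round $i+1 while round $i is still computing.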
$code.=<<___;
#include "sparc_arch.h"

#ifdef __arch64__
.register	%g2,#scratch
.register	%g3,#scratch
#endif

.section	".text",#alloc,#execinstr

.align	64
K${label}:
.type	K${label},#object
___
if ($SZ==4) {
$code.=<<___;
	.long	0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
	.long	0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
	.long	0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
	.long	0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
	.long	0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
	.long	0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
	.long	0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
	.long	0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
	.long	0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
	.long	0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
	.long	0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
	.long	0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
	.long	0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
	.long	0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
	.long	0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
	.long	0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
___
} else {
$code.=<<___;
	.long	0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd
	.long	0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc
	.long	0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019
	.long	0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118
	.long	0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe
	.long	0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2
	.long	0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1
	.long	0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694
	.long	0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3
	.long	0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65
	.long	0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483
	.long	0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5
	.long	0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210
	.long	0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4
	.long	0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725
	.long	0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70
	.long	0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926
	.long	0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df
	.long	0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8
	.long	0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b
	.long	0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001
	.long	0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30
	.long	0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910
	.long	0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8
	.long	0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53
	.long	0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8
	.long	0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb
	.long	0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3
	.long	0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60
	.long	0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec
	.long	0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9
	.long	0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b
	.long	0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207
	.long	0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178
	.long	0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6
	.long	0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b
	.long	0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493
	.long	0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c
	.long	0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a
	.long	0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817
___
}
$code.=<<___;
.size	K${label},.-K${label}

#ifdef __PIC__
SPARC_PIC_THUNK(%g1)
#endif

.globl	sha${label}_block_data_order
.align	32
sha${label}_block_data_order:
	SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
	ld	[%g1+4],%g1		! OPENSSL_sparcv9cap_P[1]

	andcc	%g1, CFR_SHA${label}, %g0
	be	.Lsoftware
	nop
___
$code.=<<___ if ($SZ==8); 		# SHA512
	ldd	[%o0 + 0x00], %f0	! load context
	ldd	[%o0 + 0x08], %f2
	ldd	[%o0 + 0x10], %f4
	ldd	[%o0 + 0x18], %f6
	ldd	[%o0 + 0x20], %f8
	ldd	[%o0 + 0x28], %f10
	andcc	%o1, 0x7, %g0
	ldd	[%o0 + 0x30], %f12
	bne,pn	%icc, .Lhwunaligned
	ldd	[%o0 + 0x38], %f14

.Lhwaligned_loop:
	ldd	[%o1 + 0x00], %f16
	ldd	[%o1 + 0x08], %f18
	ldd	[%o1 + 0x10], %f20
	ldd	[%o1 + 0x18], %f22
	ldd	[%o1 + 0x20], %f24
	ldd	[%o1 + 0x28], %f26
	ldd	[%o1 + 0x30], %f28
	ldd	[%o1 + 0x38], %f30
	ldd	[%o1 + 0x40], %f32
	ldd	[%o1 + 0x48], %f34
	ldd	[%o1 + 0x50], %f36
	ldd	[%o1 + 0x58], %f38
	ldd	[%o1 + 0x60], %f40
	ldd	[%o1 + 0x68], %f42
	ldd	[%o1 + 0x70], %f44
	subcc	%o2, 1, %o2		! done yet?
	ldd	[%o1 + 0x78], %f46
	add	%o1, 0x80, %o1
	prefetch [%o1 + 63], 20
	prefetch [%o1 + 64+63], 20

	.word	0x81b02860		! SHA512

	bne,pt	SIZE_T_CC, .Lhwaligned_loop
	nop

.Lhwfinish:
	std	%f0, [%o0 + 0x00]	! store context
	std	%f2, [%o0 + 0x08]
	std	%f4, [%o0 + 0x10]
	std	%f6, [%o0 + 0x18]
	std	%f8, [%o0 + 0x20]
	std	%f10, [%o0 + 0x28]
	std	%f12, [%o0 + 0x30]
	retl
	std	%f14, [%o0 + 0x38]
.align	16
.Lhwunaligned:
	alignaddr %o1, %g0, %o1

	ldd	[%o1 + 0x00], %f18
.Lhwunaligned_loop:
	ldd	[%o1 + 0x08], %f20
	ldd	[%o1 + 0x10], %f22
	ldd	[%o1 + 0x18], %f24
	ldd	[%o1 + 0x20], %f26
	ldd	[%o1 + 0x28], %f28
	ldd	[%o1 + 0x30], %f30
	ldd	[%o1 + 0x38], %f32
	ldd	[%o1 + 0x40], %f34
	ldd	[%o1 + 0x48], %f36
	ldd	[%o1 + 0x50], %f38
	ldd	[%o1 + 0x58], %f40
	ldd	[%o1 + 0x60], %f42
	ldd	[%o1 + 0x68], %f44
	ldd	[%o1 + 0x70], %f46
	ldd	[%o1 + 0x78], %f48
	subcc	%o2, 1, %o2		! done yet?
	ldd	[%o1 + 0x80], %f50
	add	%o1, 0x80, %o1
	prefetch [%o1 + 63], 20
	prefetch [%o1 + 64+63], 20

	faligndata %f18, %f20, %f16
	faligndata %f20, %f22, %f18
	faligndata %f22, %f24, %f20
	faligndata %f24, %f26, %f22
	faligndata %f26, %f28, %f24
	faligndata %f28, %f30, %f26
	faligndata %f30, %f32, %f28
	faligndata %f32, %f34, %f30
	faligndata %f34, %f36, %f32
	faligndata %f36, %f38, %f34
	faligndata %f38, %f40, %f36
	faligndata %f40, %f42, %f38
	faligndata %f42, %f44, %f40
	faligndata %f44, %f46, %f42
	faligndata %f46, %f48, %f44
	faligndata %f48, %f50, %f46

	.word	0x81b02860		! SHA512

	bne,pt	SIZE_T_CC, .Lhwunaligned_loop
	for	%f50, %f50, %f18	! %f18=%f50

	ba	.Lhwfinish
	nop
___
$code.=<<___ if ($SZ==4); 		# SHA256
	ld	[%o0 + 0x00], %f0
	ld	[%o0 + 0x04], %f1
	ld	[%o0 + 0x08], %f2
	ld	[%o0 + 0x0c], %f3
	ld	[%o0 + 0x10], %f4
	ld	[%o0 + 0x14], %f5
	andcc	%o1, 0x7, %g0
	ld	[%o0 + 0x18], %f6
	bne,pn	%icc, .Lhwunaligned
	ld	[%o0 + 0x1c], %f7

.Lhwloop:
	ldd	[%o1 + 0x00], %f8
	ldd	[%o1 + 0x08], %f10
	ldd	[%o1 + 0x10], %f12
	ldd	[%o1 + 0x18], %f14
	ldd	[%o1 + 0x20], %f16
	ldd	[%o1 + 0x28], %f18
	ldd	[%o1 + 0x30], %f20
	subcc	%o2, 1, %o2		! done yet?
	ldd	[%o1 + 0x38], %f22
	add	%o1, 0x40, %o1
	prefetch [%o1 + 63], 20

	.word	0x81b02840		! SHA256

	bne,pt	SIZE_T_CC, .Lhwloop
	nop

.Lhwfinish:
	st	%f0, [%o0 + 0x00]	! store context
	st	%f1, [%o0 + 0x04]
	st	%f2, [%o0 + 0x08]
	st	%f3, [%o0 + 0x0c]
	st	%f4, [%o0 + 0x10]
	st	%f5, [%o0 + 0x14]
	st	%f6, [%o0 + 0x18]
	retl
	st	%f7, [%o0 + 0x1c]
.align	8
.Lhwunaligned:
	alignaddr %o1, %g0, %o1

	ldd	[%o1 + 0x00], %f10
.Lhwunaligned_loop:
	ldd	[%o1 + 0x08], %f12
	ldd	[%o1 + 0x10], %f14
	ldd	[%o1 + 0x18], %f16
	ldd	[%o1 + 0x20], %f18
	ldd	[%o1 + 0x28], %f20
	ldd	[%o1 + 0x30], %f22
	ldd	[%o1 + 0x38], %f24
	subcc	%o2, 1, %o2		! done yet?
	ldd	[%o1 + 0x40], %f26
	add	%o1, 0x40, %o1
	prefetch [%o1 + 63], 20

	faligndata %f10, %f12, %f8
	faligndata %f12, %f14, %f10
	faligndata %f14, %f16, %f12
	faligndata %f16, %f18, %f14
	faligndata %f18, %f20, %f16
	faligndata %f20, %f22, %f18
	faligndata %f22, %f24, %f20
	faligndata %f24, %f26, %f22

	.word	0x81b02840		! SHA256

	bne,pt	SIZE_T_CC, .Lhwunaligned_loop
	for	%f26, %f26, %f10	! %f10=%f26

	ba	.Lhwfinish
	nop
___
$code.=<<___;
.align	16
.Lsoftware:
	save	%sp,-STACK_FRAME-$locals,%sp
	and	$inp,`$align-1`,$tmp31
	sllx	$len,`log(16*$SZ)/log(2)`,$len
	andn	$inp,`$align-1`,$inp
	sll	$tmp31,3,$tmp31
	add	$inp,$len,$len
___
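# Register state at this point [a sketch, not emitted code]: $inp has
# been rounded down to an $align boundary, $tmp31 holds the discarded
# byte offset multiplied by 8 [i.e. a shift count in bits], and $len
# has been converted from a block count into an end pointer.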
$code.=<<___ if ($SZ==8); 		# SHA512
	mov	32,$tmp32
	sub	$tmp32,$tmp31,$tmp32
___
$code.=<<___;
.Lpic:	call	.+8
	add	%o7,K${label}-.Lpic,$Ktbl

	$LD	[$ctx+`0*$SZ`],$A
	$LD	[$ctx+`1*$SZ`],$B
	$LD	[$ctx+`2*$SZ`],$C
	$LD	[$ctx+`3*$SZ`],$D
	$LD	[$ctx+`4*$SZ`],$E
	$LD	[$ctx+`5*$SZ`],$F
	$LD	[$ctx+`6*$SZ`],$G
	$LD	[$ctx+`7*$SZ`],$H

.Lloop:
___
for ($i=0;$i<16;$i++)	{ &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=".L16_xx:\n";
for (;$i<32;$i++)	{ &$BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	and	$tmp2,0xfff,$tmp2
	cmp	$tmp2,$lastK
	bne	.L16_xx
	add	$Ktbl,`16*$SZ`,$Ktbl	! Ktbl+=16
___
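# Loop-exit trick: instead of a round counter, the code compares the
# low 12 bits of the K[i] value it has just consumed against $lastK,
# which matches only the final round constant [0x..8f2 for SHA256,
# 0x..817 for SHA512, as set up at the top of the file].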
$code.=<<___ if ($SZ==4); 		# SHA256
	$LD	[$ctx+`0*$SZ`],@X[0]
	$LD	[$ctx+`1*$SZ`],@X[1]
	$LD	[$ctx+`2*$SZ`],@X[2]
	$LD	[$ctx+`3*$SZ`],@X[3]
	$LD	[$ctx+`4*$SZ`],@X[4]
	$LD	[$ctx+`5*$SZ`],@X[5]
	$LD	[$ctx+`6*$SZ`],@X[6]
	$LD	[$ctx+`7*$SZ`],@X[7]

	add	$A,@X[0],$A
	$ST	$A,[$ctx+`0*$SZ`]
	add	$B,@X[1],$B
	$ST	$B,[$ctx+`1*$SZ`]
	add	$C,@X[2],$C
	$ST	$C,[$ctx+`2*$SZ`]
	add	$D,@X[3],$D
	$ST	$D,[$ctx+`3*$SZ`]
	add	$E,@X[4],$E
	$ST	$E,[$ctx+`4*$SZ`]
	add	$F,@X[5],$F
	$ST	$F,[$ctx+`5*$SZ`]
	add	$G,@X[6],$G
	$ST	$G,[$ctx+`6*$SZ`]
	add	$H,@X[7],$H
	$ST	$H,[$ctx+`7*$SZ`]
___
$code.=<<___ if ($SZ==8); 		# SHA512
	ld	[$ctx+`0*$SZ+0`],%l0
	ld	[$ctx+`0*$SZ+4`],%l1
	ld	[$ctx+`1*$SZ+0`],%l2
	ld	[$ctx+`1*$SZ+4`],%l3
	ld	[$ctx+`2*$SZ+0`],%l4
	ld	[$ctx+`2*$SZ+4`],%l5
	ld	[$ctx+`3*$SZ+0`],%l6

	sllx	%l0,32,$tmp0
	ld	[$ctx+`3*$SZ+4`],%l7
	sllx	%l2,32,$tmp1
	or	%l1,$tmp0,$tmp0
	or	%l3,$tmp1,$tmp1
	add	$tmp0,$A,$A
	add	$tmp1,$B,$B
	$ST	$A,[$ctx+`0*$SZ`]
	sllx	%l4,32,$tmp2
	$ST	$B,[$ctx+`1*$SZ`]
	sllx	%l6,32,$T1
	or	%l5,$tmp2,$tmp2
	or	%l7,$T1,$T1
	add	$tmp2,$C,$C
	$ST	$C,[$ctx+`2*$SZ`]
	add	$T1,$D,$D
	$ST	$D,[$ctx+`3*$SZ`]

	ld	[$ctx+`4*$SZ+0`],%l0
	ld	[$ctx+`4*$SZ+4`],%l1
	ld	[$ctx+`5*$SZ+0`],%l2
	ld	[$ctx+`5*$SZ+4`],%l3
	ld	[$ctx+`6*$SZ+0`],%l4
	ld	[$ctx+`6*$SZ+4`],%l5
	ld	[$ctx+`7*$SZ+0`],%l6

	sllx	%l0,32,$tmp0
	ld	[$ctx+`7*$SZ+4`],%l7
	sllx	%l2,32,$tmp1
	or	%l1,$tmp0,$tmp0
	or	%l3,$tmp1,$tmp1
	add	$tmp0,$E,$E
	add	$tmp1,$F,$F
	$ST	$E,[$ctx+`4*$SZ`]
	sllx	%l4,32,$tmp2
	$ST	$F,[$ctx+`5*$SZ`]
	sllx	%l6,32,$T1
	or	%l5,$tmp2,$tmp2
	or	%l7,$T1,$T1
	add	$tmp2,$G,$G
	$ST	$G,[$ctx+`6*$SZ`]
	add	$T1,$H,$H
	$ST	$H,[$ctx+`7*$SZ`]
___
$code.=<<___;
	add	$inp,`16*$SZ`,$inp	! advance inp
	cmp	$inp,$len
	bne	SIZE_T_CC,.Lloop
	sub	$Ktbl,`($rounds-16)*$SZ`,$Ktbl	! rewind Ktbl

	ret
	restore
.type	sha${label}_block_data_order,#function
.size	sha${label}_block_data_order,(.-sha${label}_block_data_order)
.asciz	"SHA${label} block transform for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
.align	4
___
# Purpose of these subroutines is to explicitly encode VIS instructions,
# so that one can compile the module without having to specify VIS
# extensions on the compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
# The idea is to reserve the option of producing a "universal" binary
# and let the programmer detect at run-time whether the current CPU is
# VIS capable.
sub unvis {
my ($mnemonic,$rs1,$rs2,$rd)=@_;
my ($ref,$opf);
my %visopf = (	"faligndata"	=> 0x048,
		"for"		=> 0x07c	);

    $ref = "$mnemonic\t$rs1,$rs2,$rd";

    if ($opf=$visopf{$mnemonic}) {
	foreach ($rs1,$rs2,$rd) {
	    return $ref if (!/%f([0-9]{1,2})/);
	    $_=$1;
	    if ($1>=32) {
		return $ref if ($1&1);
		# re-encode for upper double register addressing
		$_=($1|$1>>5)&31;
	    }
	}

	return	sprintf ".word\t0x%08x !%s",
			0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
			$ref;
    } else {
	return $ref;
    }
}
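# Worked example of the encoding above [hand-checked against the
# formula; assumes even double registers below %f32, which need no
# re-encoding]: "faligndata %f18,%f20,%f16" has rd=16, rs1=18, rs2=20
# and opf=0x048, so it is emitted as
#
#	.word	0x81b00000|16<<25|18<<14|0x048<<5|20	! = 0xa1b48914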
sub unalignaddr {
my ($mnemonic,$rs1,$rs2,$rd)=@_;
my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
my $ref="$mnemonic\t$rs1,$rs2,$rd";

    foreach ($rs1,$rs2,$rd) {
	if (/%([goli])([0-7])/)	{ $_=$bias{$1}+$2; }
	else			{ return $ref; }
    }

    return  sprintf ".word\t0x%08x !%s",
		    0x81b00300|$rd<<25|$rs1<<14|$rs2,
		    $ref;
}
foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/ge;

	s/\b(f[^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
		&unvis($1,$2,$3,$4)
	 /ge;
	s/\b(alignaddr)\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
		&unalignaddr($1,$2,$3,$4)
	 /ge;

	print $_,"\n";
}

close STDOUT;