#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
#
# Hardware SPARC T4 support by David S. Miller <davem@davemloft.net>.
# ====================================================================

# SHA256 performance improvement over compiler generated code varies
# from 40% for Sun C [32-bit build] to 70% for gcc [3.3, 64-bit
# build]. Just like in the SHA1 module I aim to ensure scalability on
# UltraSPARC T1 by packing X[16] into 8 64-bit registers.

# SHA512 on pre-T1 UltraSPARC.
#
# Performance is >75% better than 64-bit code generated by Sun C and
# over 2x better than 32-bit code. X[16] resides on stack, but access
# to it is scheduled for L2 latency and staged through the 32 least
# significant bits of %l0-%l7. The latter is done to achieve 32-/64-bit
# ABI duality. Nevertheless it's ~40% faster than SHA256, which is
# pretty good [the optimal coefficient is 50%].

# SHA512 on UltraSPARC T1.
#
# It's not any faster than 64-bit code generated by Sun C 5.8. This is
# because the 64-bit code generator has the advantage of using 64-bit
# loads(*) to access X[16], which I consciously traded for 32-/64-bit
# ABI duality [as per above]. But it surpasses 32-bit Sun C generated
# code by 60%, not to mention that it doesn't suffer from severe decay
# when running four threads per physical core, and that it leaves gcc
# [3.4] behind by over a 4x factor! Compared to SHA256, single-thread
# performance is only 10% better, but overall throughput at the maximum
# thread count for a given CPU exceeds that of SHA256 by 30% [again,
# the optimal coefficient is 50%].
#
# (*) Unlike on pre-T1 UltraSPARC, loads on T1 are executed strictly
#     in-order, i.e. a load instruction has to complete before the next
#     instruction in the given thread is executed, even if the latter
#     is not dependent on the load result! This means that on T1 two
#     32-bit loads are always slower than one 64-bit load. Once again
#     this is unlike pre-T1 UltraSPARC, where, if scheduled
#     appropriately, 2x32-bit loads can be as fast as 1x64-bit ones.

# SPARC T4 SHA256/512 hardware achieves 3.17/2.01 cycles per byte,
# which is 9.3x/11.1x faster than software. Multi-process benchmark
# saturates at 11.5x single-process result on 8-core processor, or
# ~11/16GBps per 2.85GHz socket.
$output=shift;
open STDOUT,">$output";
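# Usage sketch [a by-hand invocation; the build system normally drives
# this, and the output file names here are illustrative]:
#
#	perl sha512-sparcv9.pl sha512-sparcv9.S		# SHA512 flavour
#	perl sha512-sparcv9.pl sha256-sparcv9.S		# SHA256 flavour
#
# The only thing examined is whether the output file name matches /512/,
# as the branch below shows.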
if ($output =~ /512/) {
	$label="512";
	$SZ=8;
	$LD="ldx";		# load from memory
	$ST="stx";		# store to memory
	$SLL="sllx";		# shift left logical
	$SRL="srlx";		# shift right logical
	@Sigma0=(28,34,39);
	@Sigma1=(14,18,41);
	@sigma0=( 7, 1, 8);	# right shift first
	@sigma1=( 6,19,61);	# right shift first
	$lastK=0x817;
	$rounds=80;
	$align=4;

	$locals=16*$SZ;		# X[16]

	$A="%o0";
	$B="%o1";
	$C="%o2";
	$D="%o3";
	$E="%o4";
	$F="%o5";
	$G="%g1";
	$H="%o7";
	@V=($A,$B,$C,$D,$E,$F,$G,$H);
} else {
	$label="256";
	$SZ=4;
	$LD="ld";		# load from memory
	$ST="st";		# store to memory
	$SLL="sll";		# shift left logical
	$SRL="srl";		# shift right logical
	@Sigma0=( 2,13,22);
	@Sigma1=( 6,11,25);
	@sigma0=( 3, 7,18);	# right shift first
	@sigma1=(10,17,19);	# right shift first
	$lastK=0x8f2;
	$rounds=64;
	$align=8;

	$locals=0;		# X[16] is register resident
	@X=("%o0","%o1","%o2","%o3","%o4","%o5","%g1","%o7");

	$A="%l0";
	$B="%l1";
	$C="%l2";
	$D="%l3";
	$E="%l4";
	$F="%l5";
	$G="%l6";
	$H="%l7";
	@V=($A,$B,$C,$D,$E,$F,$G,$H);
}
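# A note on how the shift tables above are consumed: SPARCv9 has no
# rotate instruction, so BODY_00_15 below synthesizes each rotation of
# a w-bit word (w=$SZ*8) from a shift pair,
#
#	ROTR(x,r) = ($SRL x,r) | ($SLL x,w-r)
#
# e.g. for SHA512 @Sigma1=(14,18,41) expands to
# Sigma1(e) = ROTR(e,14) ^ ROTR(e,18) ^ ROTR(e,41), per FIPS 180-4.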
$T1="%g2";
$tmp0="%g3";
$tmp1="%g4";
$tmp2="%g5";

$ctx="%i0";
$inp="%i1";
$len="%i2";
$Ktbl="%i3";
$tmp31="%i4";
$tmp32="%i5";
########### SHA256
$Xload = sub {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;

    if ($i==0) {
$code.=<<___;
	ldx	[$inp+0],@X[0]
	ldx	[$inp+16],@X[2]
	ldx	[$inp+32],@X[4]
	ldx	[$inp+48],@X[6]
	ldx	[$inp+8],@X[1]
	ldx	[$inp+24],@X[3]
	subcc	%g0,$tmp31,$tmp32 ! should be 64-$tmp31, but -$tmp31 works too
	ldx	[$inp+40],@X[5]
	bz,pt	%icc,.Laligned
	ldx	[$inp+56],@X[7]

	sllx	@X[0],$tmp31,@X[0]
	ldx	[$inp+64],$T1
___
	for($j=0;$j<7;$j++)
	{   $code.=<<___;
	srlx	@X[$j+1],$tmp32,$tmp1
	sllx	@X[$j+1],$tmp31,@X[$j+1]
	or	$tmp1,@X[$j],@X[$j]
___
	}
	$code.=<<___;
	srlx	$T1,$tmp32,$T1
	or	$T1,@X[7],@X[7]
.Laligned:
___
    }

    if ($i&1) {
	$code.="\tadd\t@X[$i/2],$h,$T1\n";
    } else {
	$code.="\tsrlx\t@X[$i/2],32,$T1\n\tadd\t$h,$T1,$T1\n";
    }
} if ($SZ==4);
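# Packing sketch: the big-endian 64-bit ldx-s above leave message word
# X[2k] in bits 63:32 of @X[k] and X[2k+1] in bits 31:0, which is why
# even rounds extract their word with "srlx @X[i/2],32" while odd
# rounds can use @X[i/2] as-is [the 32-bit adds only observe the low
# half].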
########### SHA512
$Xload = sub {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
my @pair=("%l".eval(($i*2)%8),"%l".eval(($i*2)%8+1),"%l".eval((($i+1)*2)%8));

$code.=<<___ if ($i==0);
	ld	[$inp+0],%l0
	ld	[$inp+4],%l1
	ld	[$inp+8],%l2
	ld	[$inp+12],%l3
	ld	[$inp+16],%l4
	ld	[$inp+20],%l5
	ld	[$inp+24],%l6
	cmp	$tmp31,0
	ld	[$inp+28],%l7
___
$code.=<<___ if ($i<15);
	sllx	@pair[1],$tmp31,$tmp2	! Xload($i)
	add	$tmp31,32,$tmp0
	sllx	@pair[0],$tmp0,$tmp1
	`"ld	[$inp+".eval(32+0+$i*8)."],@pair[0]"	if ($i<12)`
	srlx	@pair[2],$tmp32,@pair[1]
	or	$tmp1,$tmp2,$tmp2
	or	@pair[1],$tmp2,$tmp2
	`"ld	[$inp+".eval(32+4+$i*8)."],@pair[1]"	if ($i<12)`
	add	$h,$tmp2,$T1
	$ST	$tmp2,[%sp+STACK_BIAS+STACK_FRAME+`$i*$SZ`]
___
$code.=<<___ if ($i==12);
	bnz,a,pn	%icc,.+8
	ld	[$inp+128],%l0
___
$code.=<<___ if ($i==15);
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+1)%16)*$SZ+0`],%l2
	sllx	@pair[1],$tmp31,$tmp2	! Xload($i)
	add	$tmp31,32,$tmp0
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+1)%16)*$SZ+4`],%l3
	sllx	@pair[0],$tmp0,$tmp1
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+9)%16)*$SZ+0`],%l4
	srlx	@pair[2],$tmp32,@pair[1]
	or	$tmp1,$tmp2,$tmp2
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+9)%16)*$SZ+4`],%l5
	or	@pair[1],$tmp2,$tmp2
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+14)%16)*$SZ+0`],%l6
	add	$h,$tmp2,$T1
	$ST	$tmp2,[%sp+STACK_BIAS+STACK_FRAME+`$i*$SZ`]
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+14)%16)*$SZ+4`],%l7
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+0)%16)*$SZ+0`],%l0
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+0)%16)*$SZ+4`],%l1
___
} if ($SZ==8);
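# What the Xload above computes, for a misalignment of t bits
# [$tmp31=8*(inp&7), $tmp32=32-$tmp31; a sketch derived from the shift
# pattern]: with (hi,lo) being the staged 32-bit halves in @pair[0,1]
# and "next" the following word in @pair[2],
#
#	X[i] = (hi << (32+t)) | (lo << t) | (next >> (32-t))
#
# which for aligned input (t=0) degenerates to (hi<<32)|lo, since ld
# zero-extends. The result is both fed to the round ($T1) and spilled
# to the stack-resident X[16] window.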
########### common
sub BODY_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;

    if ($i<16) {
	&$Xload(@_);
    } else {
	$code.="\tadd\t$h,$T1,$T1\n";
    }

$code.=<<___;
	$SRL	$e,@Sigma1[0],$h	!! $i
	xor	$f,$g,$tmp2
	$SLL	$e,`$SZ*8-@Sigma1[2]`,$tmp1
	and	$e,$tmp2,$tmp2
	$SRL	$e,@Sigma1[1],$tmp0
	xor	$tmp1,$h,$h
	$SLL	$e,`$SZ*8-@Sigma1[1]`,$tmp1
	xor	$tmp0,$h,$h
	$SRL	$e,@Sigma1[2],$tmp0
	xor	$tmp1,$h,$h
	$SLL	$e,`$SZ*8-@Sigma1[0]`,$tmp1
	xor	$tmp0,$h,$h
	xor	$g,$tmp2,$tmp2		! Ch(e,f,g)
	xor	$tmp1,$h,$tmp0		! Sigma1(e)

	$SRL	$a,@Sigma0[0],$h
	add	$tmp2,$T1,$T1
	$LD	[$Ktbl+`$i*$SZ`],$tmp2	! K[$i]
	$SLL	$a,`$SZ*8-@Sigma0[2]`,$tmp1
	add	$tmp0,$T1,$T1
	$SRL	$a,@Sigma0[1],$tmp0
	xor	$tmp1,$h,$h
	$SLL	$a,`$SZ*8-@Sigma0[1]`,$tmp1
	xor	$tmp0,$h,$h
	$SRL	$a,@Sigma0[2],$tmp0
	xor	$tmp1,$h,$h
	$SLL	$a,`$SZ*8-@Sigma0[0]`,$tmp1
	xor	$tmp0,$h,$h
	xor	$tmp1,$h,$h		! Sigma0(a)

	or	$a,$b,$tmp0
	and	$a,$b,$tmp1
	and	$c,$tmp0,$tmp0
	or	$tmp0,$tmp1,$tmp1	! Maj(a,b,c)
	add	$tmp2,$T1,$T1		! +=K[$i]
	add	$tmp1,$h,$h

	add	$T1,$d,$d
	add	$T1,$h,$h
___
}
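# Net effect of the round body above, in FIPS 180-4 notation:
#
#	T1 = h + Sigma1(e) + Ch(e,f,g) + K[i] + X[i]
#	d += T1;   h = T1 + Sigma0(a) + Maj(a,b,c)
#
# with Ch(e,f,g) computed in the equivalent form ((f^g)&e)^g and
# Maj(a,b,c) in the equivalent 4-instruction form ((a|b)&c)|(a&b).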
########### SHA256
$BODY_16_XX = sub {
my $i=@_[0];
my $xi;

    if ($i&1) {
	$xi=$tmp32;
	$code.="\tsrlx\t@X[(($i+1)/2)%8],32,$xi\n";
    } else {
	$xi=@X[(($i+1)/2)%8];
    }
$code.=<<___;
	srl	$xi,@sigma0[0],$T1		!! Xupdate($i)
	sll	$xi,`32-@sigma0[2]`,$tmp1
	srl	$xi,@sigma0[1],$tmp0
	xor	$tmp1,$T1,$T1
	sll	$tmp1,`@sigma0[2]-@sigma0[1]`,$tmp1
	xor	$tmp0,$T1,$T1
	srl	$xi,@sigma0[2],$tmp0
	xor	$tmp1,$T1,$T1
___
    if ($i&1) {
	$xi=@X[(($i+14)/2)%8];
    } else {
	$xi=$tmp32;
	$code.="\tsrlx\t@X[(($i+14)/2)%8],32,$xi\n";
    }
$code.=<<___;
	srl	$xi,@sigma1[0],$tmp2
	xor	$tmp0,$T1,$T1			! T1=sigma0(X[i+1])
	sll	$xi,`32-@sigma1[2]`,$tmp1
	srl	$xi,@sigma1[1],$tmp0
	xor	$tmp1,$tmp2,$tmp2
	sll	$tmp1,`@sigma1[2]-@sigma1[1]`,$tmp1
	xor	$tmp0,$tmp2,$tmp2
	srl	$xi,@sigma1[2],$tmp0
	xor	$tmp1,$tmp2,$tmp2
___
    if ($i&1) {
	$xi=@X[($i/2)%8];
$code.=<<___;
	srlx	@X[(($i+9)/2)%8],32,$tmp1	! X[i+9]
	xor	$tmp0,$tmp2,$tmp2		! sigma1(X[i+14])
	srl	@X[($i/2)%8],0,$tmp0
	add	$tmp2,$tmp1,$tmp1
	add	$xi,$T1,$T1			! +=X[i]
	xor	$tmp0,@X[($i/2)%8],@X[($i/2)%8]
	add	$tmp1,$T1,$T1

	srl	$T1,0,$T1
	or	$T1,@X[($i/2)%8],@X[($i/2)%8]
___
    } else {
	$xi=@X[(($i+9)/2)%8];
$code.=<<___;
	srlx	@X[($i/2)%8],32,$tmp1		! X[i]
	xor	$tmp0,$tmp2,$tmp2		! sigma1(X[i+14])
	add	$xi,$T1,$T1			! +=X[i+9]
	add	$tmp2,$tmp1,$tmp1
	srl	@X[($i/2)%8],0,@X[($i/2)%8]
	add	$tmp1,$T1,$T1

	sllx	$T1,32,$tmp0
	or	$tmp0,@X[($i/2)%8],@X[($i/2)%8]
___
    }
    &BODY_00_15(@_);
} if ($SZ==4);
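# The schedule computed above is the standard SHA256 recurrence
#
#	X[i+16] = sigma1(X[i+14]) + X[i+9] + sigma0(X[i+1]) + X[i]
#
# expressed relative to the 16-word window; the even/odd cases differ
# only in which half of the packed 64-bit @X[] registers each operand
# occupies, and the result is merged back into the proper half.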
########### SHA512
$BODY_16_XX = sub {
my $i=@_[0];
my @pair=("%l".eval(($i*2)%8),"%l".eval(($i*2)%8+1));

$code.=<<___;
	sllx	%l2,32,$tmp0		!! Xupdate($i)
	or	%l3,$tmp0,$tmp0

	srlx	$tmp0,@sigma0[0],$T1
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+1)%16)*$SZ+0`],%l2
	sllx	$tmp0,`64-@sigma0[2]`,$tmp1
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+1)%16)*$SZ+4`],%l3
	srlx	$tmp0,@sigma0[1],$tmp0
	xor	$tmp1,$T1,$T1
	sllx	$tmp1,`@sigma0[2]-@sigma0[1]`,$tmp1
	xor	$tmp0,$T1,$T1
	srlx	$tmp0,`@sigma0[2]-@sigma0[1]`,$tmp0
	xor	$tmp1,$T1,$T1
	sllx	%l6,32,$tmp2
	xor	$tmp0,$T1,$T1		! sigma0(X[$i+1])
	or	%l7,$tmp2,$tmp2

	srlx	$tmp2,@sigma1[0],$tmp1
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+14)%16)*$SZ+0`],%l6
	sllx	$tmp2,`64-@sigma1[2]`,$tmp0
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+14)%16)*$SZ+4`],%l7
	srlx	$tmp2,@sigma1[1],$tmp2
	xor	$tmp0,$tmp1,$tmp1
	sllx	$tmp0,`@sigma1[2]-@sigma1[1]`,$tmp0
	xor	$tmp2,$tmp1,$tmp1
	srlx	$tmp2,`@sigma1[2]-@sigma1[1]`,$tmp2
	xor	$tmp0,$tmp1,$tmp1
	sllx	%l4,32,$tmp0
	xor	$tmp2,$tmp1,$tmp1	! sigma1(X[$i+14])
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+9)%16)*$SZ+0`],%l4
	or	%l5,$tmp0,$tmp0
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+9)%16)*$SZ+4`],%l5

	sllx	%l0,32,$tmp2
	add	$tmp1,$T1,$T1
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+0)%16)*$SZ+0`],%l0
	or	%l1,$tmp2,$tmp2
	add	$tmp0,$T1,$T1		! +=X[$i+9]
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+0)%16)*$SZ+4`],%l1
	add	$tmp2,$T1,$T1		! +=X[$i]
	$ST	$T1,[%sp+STACK_BIAS+STACK_FRAME+`($i%16)*$SZ`]
___
	&BODY_00_15(@_);
} if ($SZ==8);
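# The SHA512 flavour implements the same recurrence on 64-bit words,
# but since X[16] is stack resident [for 32-/64-bit ABI duality], each
# operand is first re-assembled from two 32-bit halves staged in
# %l0-%l7, and the interleaved ld-s refill those staging registers for
# round $i+1 while round $i is still computing.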
$code.=<<___;
#include "sparc_arch.h"

#ifdef __arch64__
.register	%g2,#scratch
.register	%g3,#scratch
#endif

.section	".text",#alloc,#execinstr

.align	64
K${label}:
.type	K${label},#object
___
if ($SZ==4) {
$code.=<<___;
	.long	0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
	.long	0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
	.long	0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
	.long	0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
	.long	0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
	.long	0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
	.long	0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
	.long	0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
	.long	0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
	.long	0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
	.long	0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
	.long	0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
	.long	0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
	.long	0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
	.long	0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
	.long	0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
___
} else {
$code.=<<___;
	.long	0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd
	.long	0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc
	.long	0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019
	.long	0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118
	.long	0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe
	.long	0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2
	.long	0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1
	.long	0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694
	.long	0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3
	.long	0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65
	.long	0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483
	.long	0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5
	.long	0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210
	.long	0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4
	.long	0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725
	.long	0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70
	.long	0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926
	.long	0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df
	.long	0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8
	.long	0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b
	.long	0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001
	.long	0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30
	.long	0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910
	.long	0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8
	.long	0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53
	.long	0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8
	.long	0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb
	.long	0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3
	.long	0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60
	.long	0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec
	.long	0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9
	.long	0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b
	.long	0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207
	.long	0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178
	.long	0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6
	.long	0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b
	.long	0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493
	.long	0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c
	.long	0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a
	.long	0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817
___
}
$code.=<<___;
.size	K${label},.-K${label}

#ifdef __PIC__
SPARC_PIC_THUNK(%g1)
#endif

.globl	sha${label}_block_data_order
.align	32
sha${label}_block_data_order:
	SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
	ld	[%g1+4],%g1		! OPENSSL_sparcv9cap_P[1]

	andcc	%g1, CFR_SHA${label}, %g0
	be	.Lsoftware
	nop
___
$code.=<<___ if ($SZ==8); 		# SHA512
	ldd	[%o0 + 0x00], %f0	! load context
	ldd	[%o0 + 0x08], %f2
	ldd	[%o0 + 0x10], %f4
	ldd	[%o0 + 0x18], %f6
	ldd	[%o0 + 0x20], %f8
	ldd	[%o0 + 0x28], %f10
	andcc	%o1, 0x7, %g0
	ldd	[%o0 + 0x30], %f12
	bne,pn	%icc, .Lhwunaligned
	ldd	[%o0 + 0x38], %f14

.Lhwaligned_loop:
	ldd	[%o1 + 0x00], %f16
	ldd	[%o1 + 0x08], %f18
	ldd	[%o1 + 0x10], %f20
	ldd	[%o1 + 0x18], %f22
	ldd	[%o1 + 0x20], %f24
	ldd	[%o1 + 0x28], %f26
	ldd	[%o1 + 0x30], %f28
	ldd	[%o1 + 0x38], %f30
	ldd	[%o1 + 0x40], %f32
	ldd	[%o1 + 0x48], %f34
	ldd	[%o1 + 0x50], %f36
	ldd	[%o1 + 0x58], %f38
	ldd	[%o1 + 0x60], %f40
	ldd	[%o1 + 0x68], %f42
	ldd	[%o1 + 0x70], %f44
	subcc	%o2, 1, %o2		! done yet?
	ldd	[%o1 + 0x78], %f46
	add	%o1, 0x80, %o1
	prefetch [%o1 + 63], 20
	prefetch [%o1 + 64+63], 20

	.word	0x81b02860		! SHA512

	bne,pt	SIZE_T_CC, .Lhwaligned_loop
	nop

.Lhwfinish:
	std	%f0, [%o0 + 0x00]	! store context
	std	%f2, [%o0 + 0x08]
	std	%f4, [%o0 + 0x10]
	std	%f6, [%o0 + 0x18]
	std	%f8, [%o0 + 0x20]
	std	%f10, [%o0 + 0x28]
	std	%f12, [%o0 + 0x30]
	retl
	std	%f14, [%o0 + 0x38]
.align	16
.Lhwunaligned:
	alignaddr %o1, %g0, %o1

	ldd	[%o1 + 0x00], %f18
.Lhwunaligned_loop:
	ldd	[%o1 + 0x08], %f20
	ldd	[%o1 + 0x10], %f22
	ldd	[%o1 + 0x18], %f24
	ldd	[%o1 + 0x20], %f26
	ldd	[%o1 + 0x28], %f28
	ldd	[%o1 + 0x30], %f30
	ldd	[%o1 + 0x38], %f32
	ldd	[%o1 + 0x40], %f34
	ldd	[%o1 + 0x48], %f36
	ldd	[%o1 + 0x50], %f38
	ldd	[%o1 + 0x58], %f40
	ldd	[%o1 + 0x60], %f42
	ldd	[%o1 + 0x68], %f44
	ldd	[%o1 + 0x70], %f46
	ldd	[%o1 + 0x78], %f48
	subcc	%o2, 1, %o2		! done yet?
	ldd	[%o1 + 0x80], %f50
	add	%o1, 0x80, %o1
	prefetch [%o1 + 63], 20
	prefetch [%o1 + 64+63], 20

	faligndata %f18, %f20, %f16
	faligndata %f20, %f22, %f18
	faligndata %f22, %f24, %f20
	faligndata %f24, %f26, %f22
	faligndata %f26, %f28, %f24
	faligndata %f28, %f30, %f26
	faligndata %f30, %f32, %f28
	faligndata %f32, %f34, %f30
	faligndata %f34, %f36, %f32
	faligndata %f36, %f38, %f34
	faligndata %f38, %f40, %f36
	faligndata %f40, %f42, %f38
	faligndata %f42, %f44, %f40
	faligndata %f44, %f46, %f42
	faligndata %f46, %f48, %f44
	faligndata %f48, %f50, %f46

	.word	0x81b02860		! SHA512

	bne,pt	SIZE_T_CC, .Lhwunaligned_loop
	for	%f50, %f50, %f18	! %f18=%f50

	ba	.Lhwfinish
	nop
___
$code.=<<___ if ($SZ==4); 		# SHA256
	ld	[%o0 + 0x00], %f0
	ld	[%o0 + 0x04], %f1
	ld	[%o0 + 0x08], %f2
	ld	[%o0 + 0x0c], %f3
	ld	[%o0 + 0x10], %f4
	ld	[%o0 + 0x14], %f5
	andcc	%o1, 0x7, %g0
	ld	[%o0 + 0x18], %f6
	bne,pn	%icc, .Lhwunaligned
	ld	[%o0 + 0x1c], %f7

.Lhwloop:
	ldd	[%o1 + 0x00], %f8
	ldd	[%o1 + 0x08], %f10
	ldd	[%o1 + 0x10], %f12
	ldd	[%o1 + 0x18], %f14
	ldd	[%o1 + 0x20], %f16
	ldd	[%o1 + 0x28], %f18
	ldd	[%o1 + 0x30], %f20
	subcc	%o2, 1, %o2		! done yet?
	ldd	[%o1 + 0x38], %f22
	add	%o1, 0x40, %o1
	prefetch [%o1 + 63], 20

	.word	0x81b02840		! SHA256

	bne,pt	SIZE_T_CC, .Lhwloop
	nop

.Lhwfinish:
	st	%f0, [%o0 + 0x00]	! store context
	st	%f1, [%o0 + 0x04]
	st	%f2, [%o0 + 0x08]
	st	%f3, [%o0 + 0x0c]
	st	%f4, [%o0 + 0x10]
	st	%f5, [%o0 + 0x14]
	st	%f6, [%o0 + 0x18]
	retl
	st	%f7, [%o0 + 0x1c]
.align	8
.Lhwunaligned:
	alignaddr %o1, %g0, %o1

	ldd	[%o1 + 0x00], %f10
.Lhwunaligned_loop:
	ldd	[%o1 + 0x08], %f12
	ldd	[%o1 + 0x10], %f14
	ldd	[%o1 + 0x18], %f16
	ldd	[%o1 + 0x20], %f18
	ldd	[%o1 + 0x28], %f20
	ldd	[%o1 + 0x30], %f22
	ldd	[%o1 + 0x38], %f24
	subcc	%o2, 1, %o2		! done yet?
	ldd	[%o1 + 0x40], %f26
	add	%o1, 0x40, %o1
	prefetch [%o1 + 63], 20

	faligndata %f10, %f12, %f8
	faligndata %f12, %f14, %f10
	faligndata %f14, %f16, %f12
	faligndata %f16, %f18, %f14
	faligndata %f18, %f20, %f16
	faligndata %f20, %f22, %f18
	faligndata %f22, %f24, %f20
	faligndata %f24, %f26, %f22

	.word	0x81b02840		! SHA256

	bne,pt	SIZE_T_CC, .Lhwunaligned_loop
	for	%f26, %f26, %f10	! %f10=%f26

	ba	.Lhwfinish
	nop
___
$code.=<<___;
.align	16
.Lsoftware:
	save	%sp,-STACK_FRAME-$locals,%sp
	and	$inp,`$align-1`,$tmp31
	sllx	$len,`log(16*$SZ)/log(2)`,$len
	andn	$inp,`$align-1`,$inp
	sll	$tmp31,3,$tmp31
	add	$inp,$len,$len
___
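# Register state at this point [a sketch, not emitted code]: $inp has
# been rounded down to an $align boundary, $tmp31 holds the discarded
# byte offset multiplied by 8 [i.e. a shift count in bits], and $len
# has been converted from a block count into an end pointer.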
$code.=<<___ if ($SZ==8); 		# SHA512
	mov	32,$tmp32
	sub	$tmp32,$tmp31,$tmp32
___
$code.=<<___;
.Lpic:	call	.+8
	add	%o7,K${label}-.Lpic,$Ktbl

	$LD	[$ctx+`0*$SZ`],$A
	$LD	[$ctx+`1*$SZ`],$B
	$LD	[$ctx+`2*$SZ`],$C
	$LD	[$ctx+`3*$SZ`],$D
	$LD	[$ctx+`4*$SZ`],$E
	$LD	[$ctx+`5*$SZ`],$F
	$LD	[$ctx+`6*$SZ`],$G
	$LD	[$ctx+`7*$SZ`],$H

.Lloop:
___
for ($i=0;$i<16;$i++)	{ &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=".L16_xx:\n";
for (;$i<32;$i++)	{ &$BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	and	$tmp2,0xfff,$tmp2
	cmp	$tmp2,$lastK
	bne	.L16_xx
	add	$Ktbl,`16*$SZ`,$Ktbl	! Ktbl+=16
___
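# Loop-exit trick: instead of a round counter, the code compares the
# low 12 bits of the K[i] value it has just consumed against $lastK,
# which matches only the final round constant [0x..8f2 for SHA256,
# 0x..817 for SHA512, as set up at the top of the file].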
$code.=<<___ if ($SZ==4); 		# SHA256
	$LD	[$ctx+`0*$SZ`],@X[0]
	$LD	[$ctx+`1*$SZ`],@X[1]
	$LD	[$ctx+`2*$SZ`],@X[2]
	$LD	[$ctx+`3*$SZ`],@X[3]
	$LD	[$ctx+`4*$SZ`],@X[4]
	$LD	[$ctx+`5*$SZ`],@X[5]
	$LD	[$ctx+`6*$SZ`],@X[6]
	$LD	[$ctx+`7*$SZ`],@X[7]

	add	$A,@X[0],$A
	$ST	$A,[$ctx+`0*$SZ`]
	add	$B,@X[1],$B
	$ST	$B,[$ctx+`1*$SZ`]
	add	$C,@X[2],$C
	$ST	$C,[$ctx+`2*$SZ`]
	add	$D,@X[3],$D
	$ST	$D,[$ctx+`3*$SZ`]
	add	$E,@X[4],$E
	$ST	$E,[$ctx+`4*$SZ`]
	add	$F,@X[5],$F
	$ST	$F,[$ctx+`5*$SZ`]
	add	$G,@X[6],$G
	$ST	$G,[$ctx+`6*$SZ`]
	add	$H,@X[7],$H
	$ST	$H,[$ctx+`7*$SZ`]
___
$code.=<<___ if ($SZ==8); 		# SHA512
	ld	[$ctx+`0*$SZ+0`],%l0
	ld	[$ctx+`0*$SZ+4`],%l1
	ld	[$ctx+`1*$SZ+0`],%l2
	ld	[$ctx+`1*$SZ+4`],%l3
	ld	[$ctx+`2*$SZ+0`],%l4
	ld	[$ctx+`2*$SZ+4`],%l5
	ld	[$ctx+`3*$SZ+0`],%l6

	sllx	%l0,32,$tmp0
	ld	[$ctx+`3*$SZ+4`],%l7
	sllx	%l2,32,$tmp1
	or	%l1,$tmp0,$tmp0
	or	%l3,$tmp1,$tmp1
	add	$tmp0,$A,$A
	add	$tmp1,$B,$B
	$ST	$A,[$ctx+`0*$SZ`]
	sllx	%l4,32,$tmp2
	$ST	$B,[$ctx+`1*$SZ`]
	sllx	%l6,32,$T1
	or	%l5,$tmp2,$tmp2
	or	%l7,$T1,$T1
	add	$tmp2,$C,$C
	$ST	$C,[$ctx+`2*$SZ`]
	add	$T1,$D,$D
	$ST	$D,[$ctx+`3*$SZ`]

	ld	[$ctx+`4*$SZ+0`],%l0
	ld	[$ctx+`4*$SZ+4`],%l1
	ld	[$ctx+`5*$SZ+0`],%l2
	ld	[$ctx+`5*$SZ+4`],%l3
	ld	[$ctx+`6*$SZ+0`],%l4
	ld	[$ctx+`6*$SZ+4`],%l5
	ld	[$ctx+`7*$SZ+0`],%l6

	sllx	%l0,32,$tmp0
	ld	[$ctx+`7*$SZ+4`],%l7
	sllx	%l2,32,$tmp1
	or	%l1,$tmp0,$tmp0
	or	%l3,$tmp1,$tmp1
	add	$tmp0,$E,$E
	add	$tmp1,$F,$F
	$ST	$E,[$ctx+`4*$SZ`]
	sllx	%l4,32,$tmp2
	$ST	$F,[$ctx+`5*$SZ`]
	sllx	%l6,32,$T1
	or	%l5,$tmp2,$tmp2
	or	%l7,$T1,$T1
	add	$tmp2,$G,$G
	$ST	$G,[$ctx+`6*$SZ`]
	add	$T1,$H,$H
	$ST	$H,[$ctx+`7*$SZ`]
___
$code.=<<___;
	add	$inp,`16*$SZ`,$inp	! advance inp
	cmp	$inp,$len
	bne	SIZE_T_CC,.Lloop
	sub	$Ktbl,`($rounds-16)*$SZ`,$Ktbl	! rewind Ktbl

	ret
	restore
.type	sha${label}_block_data_order,#function
.size	sha${label}_block_data_order,(.-sha${label}_block_data_order)
.asciz	"SHA${label} block transform for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
.align	4
___
# Purpose of these subroutines is to explicitly encode VIS instructions,
# so that one can compile the module without having to specify VIS
# extensions on the compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
# The idea is to reserve the option of producing a "universal" binary
# and let the programmer detect at run-time whether the current CPU is
# VIS capable.
sub unvis {
my ($mnemonic,$rs1,$rs2,$rd)=@_;
my ($ref,$opf);
my %visopf = (	"faligndata"	=> 0x048,
		"for"		=> 0x07c	);

    $ref = "$mnemonic\t$rs1,$rs2,$rd";

    if ($opf=$visopf{$mnemonic}) {
	foreach ($rs1,$rs2,$rd) {
	    return $ref if (!/%f([0-9]{1,2})/);
	    $_=$1;
	    if ($1>=32) {
		return $ref if ($1&1);
		# re-encode for upper double register addressing
		$_=($1|$1>>5)&31;
	    }
	}

	return	sprintf ".word\t0x%08x !%s",
			0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
			$ref;
    } else {
	return $ref;
    }
}
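# Worked example of the encoding above [hand-checked against the
# formula; assumes even double registers below %f32, which need no
# re-encoding]: "faligndata %f18,%f20,%f16" has rd=16, rs1=18, rs2=20
# and opf=0x048, so it is emitted as
#
#	.word	0x81b00000|16<<25|18<<14|0x048<<5|20	! = 0xa1b48914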
sub unalignaddr {
my ($mnemonic,$rs1,$rs2,$rd)=@_;
my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
my $ref="$mnemonic\t$rs1,$rs2,$rd";

    foreach ($rs1,$rs2,$rd) {
	if (/%([goli])([0-7])/)	{ $_=$bias{$1}+$2; }
	else			{ return $ref; }
    }

    return  sprintf ".word\t0x%08x !%s",
		    0x81b00300|$rd<<25|$rs1<<14|$rs2,
		    $ref;
}
foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/ge;

	s/\b(f[^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
		&unvis($1,$2,$3,$4)
	 /ge;
	s/\b(alignaddr)\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
		&unalignaddr($1,$2,$3,$4)
	 /ge;

	print $_,"\n";
}

close STDOUT;