lib/libcrypto/ec/asm/ecp_nistz256-sparcv9.pl

   1 #! /usr/bin/env perl
   2 # $OpenBSD: ecp_nistz256-sparcv9.pl,v 1.1 2016/11/04 17:33:20 miod Exp $
   3 #
   4 # Copyright 2015-2016 The OpenSSL Project Authors. All Rights Reserved.
   5 #
   6 # Licensed under the OpenSSL license (the "License").  You may not use
   7 # this file except in compliance with the License.  You can obtain a copy
   8 # in the file LICENSE in the source distribution or at
   9 # https://www.openssl.org/source/license.html
  10
  11
  12 # ====================================================================
  13 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  14 # project. The module is, however, dual licensed under OpenSSL and
  15 # CRYPTOGAMS licenses depending on where you obtain it. For further
  16 # details see http://www.openssl.org/~appro/cryptogams/.
  17 # ====================================================================
  18 #
  19 # ECP_NISTZ256 module for SPARCv9.
  20 #
  21 # February 2015.
  22 #
  23 # Original ECP_NISTZ256 submission targeting x86_64 is detailed in
  24 # http://eprint.iacr.org/2013/816. In the process of adaptation
  25 # original .c module was made 32-bit savvy in order to make this
  26 # implementation possible.
  27 #
  28 #                       with/without -DECP_NISTZ256_ASM
  29 # UltraSPARC III        +12-18%
  30 # SPARC T4              +99-550% (+66-150% on 32-bit Solaris)
  31 #
  32 # Ranges denote minimum and maximum improvement coefficients depending
  33 # on benchmark. Lower coefficients are for ECDSA sign, server-side
  34 # operation. Keep in mind that +200% means 3x improvement.
  35
  36 # Uncomment when all sparcv9 assembly generators are updated to take the output
  37 # file as last argument...
  38 # $output = pop;
  39 # open STDOUT,">$output";
  40
  41 $code.=<<___;
  42 #define STACK_FRAME     192
  43 #define STACK_BIAS      2047
  44
  45 #define LOCALS  (STACK_BIAS+STACK_FRAME)
  46 .register       %g2,#scratch
  47 .register       %g3,#scratch
  48 # define STACK64_FRAME  STACK_FRAME
  49 # define LOCALS64       LOCALS
  50
  51 .section        ".text",#alloc,#execinstr
  52 ___
  53
  54 {{{
  55 my ($rp,$ap,$bp)=map("%i$_",(0..2));
  56 my @acc=map("%l$_",(0..7));
  57 my ($t0,$t1,$t2,$t3,$t4,$t5,$t6,$t7)=(map("%o$_",(0..5)),"%g4","%g5");
  58 my ($bi,$a0,$mask,$carry)=(map("%i$_",(3..5)),"%g1");
  59 my ($rp_real,$ap_real)=("%g2","%g3");
  60
  61 $code.=<<___;
  62 .align  64
  63 .Lone:
  64 .long   1,0,0,0,0,0,0,0
  65
  66 ! void  ecp_nistz256_from_mont(BN_ULONG %i0[8],const BN_ULONG %i1[8]);
  67 .globl  ecp_nistz256_from_mont
  68 .align  32
  69 ecp_nistz256_from_mont:
  70         save    %sp,-STACK_FRAME,%sp
  71         nop
  72 1:      call    .+8
  73         add     %o7,.Lone-1b,$bp
  74         call    __ecp_nistz256_mul_mont
  75         nop
  76         ret
  77         restore
  78 .type   ecp_nistz256_from_mont,#function
  79 .size   ecp_nistz256_from_mont,.-ecp_nistz256_from_mont
  80
  81 ! void  ecp_nistz256_mul_mont(BN_ULONG %i0[8],const BN_ULONG %i1[8],
  82 !                                             const BN_ULONG %i2[8]);
  83 .globl  ecp_nistz256_mul_mont
  84 .align  32
  85 ecp_nistz256_mul_mont:
  86         save    %sp,-STACK_FRAME,%sp
  87         nop
  88         call    __ecp_nistz256_mul_mont
  89         nop
  90         ret
  91         restore
  92 .type   ecp_nistz256_mul_mont,#function
  93 .size   ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont
  94
  95 ! void  ecp_nistz256_sqr_mont(BN_ULONG %i0[8],const BN_ULONG %i2[8]);
  96 .globl  ecp_nistz256_sqr_mont
  97 .align  32
  98 ecp_nistz256_sqr_mont:
  99         save    %sp,-STACK_FRAME,%sp
 100         mov     $ap,$bp
 101         call    __ecp_nistz256_mul_mont
 102         nop
 103         ret
 104         restore
 105 .type   ecp_nistz256_sqr_mont,#function
 106 .size   ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont
 107 ___
 108
 109 ########################################################################
 110 # Special thing to keep in mind is that $t0-$t7 hold 64-bit values,
 111 # while all others are meant to keep 32. "Meant to" means that additions
 112 # to @acc[0-7] do "contaminate" upper bits, but they are cleared before
 113 # they can affect outcome (follow 'and' with $mask). Also keep in mind
 114 # that addition with carry is addition with 32-bit carry, even though
 115 # CPU is 64-bit. [Addition with 64-bit carry was introduced in T3, see
 116 # below for VIS3 code paths.]
 117
 118 $code.=<<___;
 119 .align  32
 120 __ecp_nistz256_mul_mont:
 121         ld      [$bp+0],$bi             ! b[0]
 122         mov     -1,$mask
 123         ld      [$ap+0],$a0
 124         srl     $mask,0,$mask           ! 0xffffffff
 125         ld      [$ap+4],$t1
 126         ld      [$ap+8],$t2
 127         ld      [$ap+12],$t3
 128         ld      [$ap+16],$t4
 129         ld      [$ap+20],$t5
 130         ld      [$ap+24],$t6
 131         ld      [$ap+28],$t7
 132         mulx    $a0,$bi,$t0             ! a[0-7]*b[0], 64-bit results
 133         mulx    $t1,$bi,$t1
 134         mulx    $t2,$bi,$t2
 135         mulx    $t3,$bi,$t3
 136         mulx    $t4,$bi,$t4
 137         mulx    $t5,$bi,$t5
 138         mulx    $t6,$bi,$t6
 139         mulx    $t7,$bi,$t7
 140         srlx    $t0,32,@acc[1]          ! extract high parts
 141         srlx    $t1,32,@acc[2]
 142         srlx    $t2,32,@acc[3]
 143         srlx    $t3,32,@acc[4]
 144         srlx    $t4,32,@acc[5]
 145         srlx    $t5,32,@acc[6]
 146         srlx    $t6,32,@acc[7]
 147         srlx    $t7,32,@acc[0]          ! "@acc[8]"
 148         mov     0,$carry
 149 ___
 150 for($i=1;$i<8;$i++) {
 151 $code.=<<___;
 152         addcc   @acc[1],$t1,@acc[1]     ! accumulate high parts
 153         ld      [$bp+4*$i],$bi          ! b[$i]
 154         ld      [$ap+4],$t1             ! re-load a[1-7]
 155         addccc  @acc[2],$t2,@acc[2]
 156         addccc  @acc[3],$t3,@acc[3]
 157         ld      [$ap+8],$t2
 158         ld      [$ap+12],$t3
 159         addccc  @acc[4],$t4,@acc[4]
 160         addccc  @acc[5],$t5,@acc[5]
 161         ld      [$ap+16],$t4
 162         ld      [$ap+20],$t5
 163         addccc  @acc[6],$t6,@acc[6]
 164         addccc  @acc[7],$t7,@acc[7]
 165         ld      [$ap+24],$t6
 166         ld      [$ap+28],$t7
 167         addccc  @acc[0],$carry,@acc[0]  ! "@acc[8]"
 168         addc    %g0,%g0,$carry
 169 ___
 170         # Reduction iteration is normally performed by accumulating
 171         # result of multiplication of modulus by "magic" digit [and
 172         # omitting least significant word, which is guaranteed to
 173         # be 0], but thanks to special form of modulus and "magic"
 174         # digit being equal to least significant word, it can be
 175         # performed with additions and subtractions alone. Indeed:
 176         #
 177         #        ffff.0001.0000.0000.0000.ffff.ffff.ffff
 178         # *                                         abcd
 179         # + xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.abcd
 180         #
 181         # Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we
 182         # rewrite above as:
 183         #
 184         #   xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.abcd
 185         # + abcd.0000.abcd.0000.0000.abcd.0000.0000.0000
 186         # -      abcd.0000.0000.0000.0000.0000.0000.abcd
 187         #
 188         # or marking redundant operations:
 189         #
 190         #   xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.----
 191         # + abcd.0000.abcd.0000.0000.abcd.----.----.----
 192         # -      abcd.----.----.----.----.----.----.----
 193
 194 $code.=<<___;
 195         ! multiplication-less reduction
 196         addcc   @acc[3],$t0,@acc[3]     ! r[3]+=r[0]
 197         addccc  @acc[4],%g0,@acc[4]     ! r[4]+=0
 198          and    @acc[1],$mask,@acc[1]
 199          and    @acc[2],$mask,@acc[2]
 200         addccc  @acc[5],%g0,@acc[5]     ! r[5]+=0
 201         addccc  @acc[6],$t0,@acc[6]     ! r[6]+=r[0]
 202          and    @acc[3],$mask,@acc[3]
 203          and    @acc[4],$mask,@acc[4]
 204         addccc  @acc[7],%g0,@acc[7]     ! r[7]+=0
 205         addccc  @acc[0],$t0,@acc[0]     ! r[8]+=r[0]    "@acc[8]"
 206          and    @acc[5],$mask,@acc[5]
 207          and    @acc[6],$mask,@acc[6]
 208         addc    $carry,%g0,$carry       ! top-most carry
 209         subcc   @acc[7],$t0,@acc[7]     ! r[7]-=r[0]
 210         subccc  @acc[0],%g0,@acc[0]     ! r[8]-=0       "@acc[8]"
 211         subc    $carry,%g0,$carry       ! top-most carry
 212          and    @acc[7],$mask,@acc[7]
 213          and    @acc[0],$mask,@acc[0]   ! "@acc[8]"
 214 ___
 215         push(@acc,shift(@acc));         # rotate registers to "omit" acc[0]
 216 $code.=<<___;
 217         mulx    $a0,$bi,$t0             ! a[0-7]*b[$i], 64-bit results
 218         mulx    $t1,$bi,$t1
 219         mulx    $t2,$bi,$t2
 220         mulx    $t3,$bi,$t3
 221         mulx    $t4,$bi,$t4
 222         mulx    $t5,$bi,$t5
 223         mulx    $t6,$bi,$t6
 224         mulx    $t7,$bi,$t7
 225         add     @acc[0],$t0,$t0         ! accumulate low parts, can't overflow
 226         add     @acc[1],$t1,$t1
 227         srlx    $t0,32,@acc[1]          ! extract high parts
 228         add     @acc[2],$t2,$t2
 229         srlx    $t1,32,@acc[2]
 230         add     @acc[3],$t3,$t3
 231         srlx    $t2,32,@acc[3]
 232         add     @acc[4],$t4,$t4
 233         srlx    $t3,32,@acc[4]
 234         add     @acc[5],$t5,$t5
 235         srlx    $t4,32,@acc[5]
 236         add     @acc[6],$t6,$t6
 237         srlx    $t5,32,@acc[6]
 238         add     @acc[7],$t7,$t7
 239         srlx    $t6,32,@acc[7]
 240         srlx    $t7,32,@acc[0]          ! "@acc[8]"
 241 ___
 242 }
 243 $code.=<<___;
 244         addcc   @acc[1],$t1,@acc[1]     ! accumulate high parts
 245         addccc  @acc[2],$t2,@acc[2]
 246         addccc  @acc[3],$t3,@acc[3]
 247         addccc  @acc[4],$t4,@acc[4]
 248         addccc  @acc[5],$t5,@acc[5]
 249         addccc  @acc[6],$t6,@acc[6]
 250         addccc  @acc[7],$t7,@acc[7]
 251         addccc  @acc[0],$carry,@acc[0]  ! "@acc[8]"
 252         addc    %g0,%g0,$carry
 253
 254         addcc   @acc[3],$t0,@acc[3]     ! multiplication-less reduction
 255         addccc  @acc[4],%g0,@acc[4]
 256         addccc  @acc[5],%g0,@acc[5]
 257         addccc  @acc[6],$t0,@acc[6]
 258         addccc  @acc[7],%g0,@acc[7]
 259         addccc  @acc[0],$t0,@acc[0]     ! "@acc[8]"
 260         addc    $carry,%g0,$carry
 261         subcc   @acc[7],$t0,@acc[7]
 262         subccc  @acc[0],%g0,@acc[0]     ! "@acc[8]"
 263         subc    $carry,%g0,$carry       ! top-most carry
 264 ___
 265         push(@acc,shift(@acc));         # rotate registers to omit acc[0]
 266 $code.=<<___;
 267         ! Final step is "if result > mod, subtract mod", but we do it
 268         ! "other way around", namely subtract modulus from result
 269         ! and if it borrowed, add modulus back.
 270
 271         subcc   @acc[0],-1,@acc[0]      ! subtract modulus
 272         subccc  @acc[1],-1,@acc[1]
 273         subccc  @acc[2],-1,@acc[2]
 274         subccc  @acc[3],0,@acc[3]
 275         subccc  @acc[4],0,@acc[4]
 276         subccc  @acc[5],0,@acc[5]
 277         subccc  @acc[6],1,@acc[6]
 278         subccc  @acc[7],-1,@acc[7]
 279         subc    $carry,0,$carry         ! broadcast borrow bit
 280
 281         ! Note that because mod has special form, i.e. consists of
 282         ! 0xffffffff, 1 and 0s, we can conditionally synthesize it by
 283         ! using value of broadcasted borrow and the borrow bit itself.
 284         ! To minimize dependency chain we first broadcast and then
 285         ! extract the bit by negating (follow $bi).
 286
 287         addcc   @acc[0],$carry,@acc[0]  ! add modulus or zero
 288         addccc  @acc[1],$carry,@acc[1]
 289         neg     $carry,$bi
 290         st      @acc[0],[$rp]
 291         addccc  @acc[2],$carry,@acc[2]
 292         st      @acc[1],[$rp+4]
 293         addccc  @acc[3],0,@acc[3]
 294         st      @acc[2],[$rp+8]
 295         addccc  @acc[4],0,@acc[4]
 296         st      @acc[3],[$rp+12]
 297         addccc  @acc[5],0,@acc[5]
 298         st      @acc[4],[$rp+16]
 299         addccc  @acc[6],$bi,@acc[6]
 300         st      @acc[5],[$rp+20]
 301         addc    @acc[7],$carry,@acc[7]
 302         st      @acc[6],[$rp+24]
 303         retl
 304         st      @acc[7],[$rp+28]
 305 .type   __ecp_nistz256_mul_mont,#function
 306 .size   __ecp_nistz256_mul_mont,.-__ecp_nistz256_mul_mont
 307
 308 ! void  ecp_nistz256_add(BN_ULONG %i0[8],const BN_ULONG %i1[8],
 309 !                                        const BN_ULONG %i2[8]);
 310 .globl  ecp_nistz256_add
 311 .align  32
 312 ecp_nistz256_add:
 313         save    %sp,-STACK_FRAME,%sp
 314         ld      [$ap],@acc[0]
 315         ld      [$ap+4],@acc[1]
 316         ld      [$ap+8],@acc[2]
 317         ld      [$ap+12],@acc[3]
 318         ld      [$ap+16],@acc[4]
 319         ld      [$ap+20],@acc[5]
 320         ld      [$ap+24],@acc[6]
 321         call    __ecp_nistz256_add
 322         ld      [$ap+28],@acc[7]
 323         ret
 324         restore
 325 .type   ecp_nistz256_add,#function
 326 .size   ecp_nistz256_add,.-ecp_nistz256_add
 327
 328 .align  32
 329 __ecp_nistz256_add:
 330         ld      [$bp+0],$t0             ! b[0]
 331         ld      [$bp+4],$t1
 332         ld      [$bp+8],$t2
 333         ld      [$bp+12],$t3
 334         addcc   @acc[0],$t0,@acc[0]
 335         ld      [$bp+16],$t4
 336         ld      [$bp+20],$t5
 337         addccc  @acc[1],$t1,@acc[1]
 338         ld      [$bp+24],$t6
 339         ld      [$bp+28],$t7
 340         addccc  @acc[2],$t2,@acc[2]
 341         addccc  @acc[3],$t3,@acc[3]
 342         addccc  @acc[4],$t4,@acc[4]
 343         addccc  @acc[5],$t5,@acc[5]
 344         addccc  @acc[6],$t6,@acc[6]
 345         addccc  @acc[7],$t7,@acc[7]
 346         addc    %g0,%g0,$carry
 347
 348 .Lreduce_by_sub:
 349
 350         ! if a+b >= modulus, subtract modulus.
 351         !
 352         ! But since comparison implies subtraction, we subtract
 353         ! modulus and then add it back if subraction borrowed.
 354
 355         subcc   @acc[0],-1,@acc[0]
 356         subccc  @acc[1],-1,@acc[1]
 357         subccc  @acc[2],-1,@acc[2]
 358         subccc  @acc[3], 0,@acc[3]
 359         subccc  @acc[4], 0,@acc[4]
 360         subccc  @acc[5], 0,@acc[5]
 361         subccc  @acc[6], 1,@acc[6]
 362         subccc  @acc[7],-1,@acc[7]
 363         subc    $carry,0,$carry
 364
 365         ! Note that because mod has special form, i.e. consists of
 366         ! 0xffffffff, 1 and 0s, we can conditionally synthesize it by
 367         ! using value of borrow and its negative.
 368
 369         addcc   @acc[0],$carry,@acc[0]  ! add synthesized modulus
 370         addccc  @acc[1],$carry,@acc[1]
 371         neg     $carry,$bi
 372         st      @acc[0],[$rp]
 373         addccc  @acc[2],$carry,@acc[2]
 374         st      @acc[1],[$rp+4]
 375         addccc  @acc[3],0,@acc[3]
 376         st      @acc[2],[$rp+8]
 377         addccc  @acc[4],0,@acc[4]
 378         st      @acc[3],[$rp+12]
 379         addccc  @acc[5],0,@acc[5]
 380         st      @acc[4],[$rp+16]
 381         addccc  @acc[6],$bi,@acc[6]
 382         st      @acc[5],[$rp+20]
 383         addc    @acc[7],$carry,@acc[7]
 384         st      @acc[6],[$rp+24]
 385         retl
 386         st      @acc[7],[$rp+28]
 387 .type   __ecp_nistz256_add,#function
 388 .size   __ecp_nistz256_add,.-__ecp_nistz256_add
 389
 390 ! void  ecp_nistz256_mul_by_2(BN_ULONG %i0[8],const BN_ULONG %i1[8]);
 391 .globl  ecp_nistz256_mul_by_2
 392 .align  32
 393 ecp_nistz256_mul_by_2:
 394         save    %sp,-STACK_FRAME,%sp
 395         ld      [$ap],@acc[0]
 396         ld      [$ap+4],@acc[1]
 397         ld      [$ap+8],@acc[2]
 398         ld      [$ap+12],@acc[3]
 399         ld      [$ap+16],@acc[4]
 400         ld      [$ap+20],@acc[5]
 401         ld      [$ap+24],@acc[6]
 402         call    __ecp_nistz256_mul_by_2
 403         ld      [$ap+28],@acc[7]
 404         ret
 405         restore
 406 .type   ecp_nistz256_mul_by_2,#function
 407 .size   ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2
 408
 409 .align  32
 410 __ecp_nistz256_mul_by_2:
 411         addcc   @acc[0],@acc[0],@acc[0] ! a+a=2*a
 412         addccc  @acc[1],@acc[1],@acc[1]
 413         addccc  @acc[2],@acc[2],@acc[2]
 414         addccc  @acc[3],@acc[3],@acc[3]
 415         addccc  @acc[4],@acc[4],@acc[4]
 416         addccc  @acc[5],@acc[5],@acc[5]
 417         addccc  @acc[6],@acc[6],@acc[6]
 418         addccc  @acc[7],@acc[7],@acc[7]
 419         b       .Lreduce_by_sub
 420         addc    %g0,%g0,$carry
 421 .type   __ecp_nistz256_mul_by_2,#function
 422 .size   __ecp_nistz256_mul_by_2,.-__ecp_nistz256_mul_by_2
 423
 424 ! void  ecp_nistz256_mul_by_3(BN_ULONG %i0[8],const BN_ULONG %i1[8]);
 425 .globl  ecp_nistz256_mul_by_3
 426 .align  32
 427 ecp_nistz256_mul_by_3:
 428         save    %sp,-STACK_FRAME,%sp
 429         ld      [$ap],@acc[0]
 430         ld      [$ap+4],@acc[1]
 431         ld      [$ap+8],@acc[2]
 432         ld      [$ap+12],@acc[3]
 433         ld      [$ap+16],@acc[4]
 434         ld      [$ap+20],@acc[5]
 435         ld      [$ap+24],@acc[6]
 436         call    __ecp_nistz256_mul_by_3
 437         ld      [$ap+28],@acc[7]
 438         ret
 439         restore
 440 .type   ecp_nistz256_mul_by_3,#function
 441 .size   ecp_nistz256_mul_by_3,.-ecp_nistz256_mul_by_3
 442
 443 .align  32
 444 __ecp_nistz256_mul_by_3:
 445         addcc   @acc[0],@acc[0],$t0     ! a+a=2*a
 446         addccc  @acc[1],@acc[1],$t1
 447         addccc  @acc[2],@acc[2],$t2
 448         addccc  @acc[3],@acc[3],$t3
 449         addccc  @acc[4],@acc[4],$t4
 450         addccc  @acc[5],@acc[5],$t5
 451         addccc  @acc[6],@acc[6],$t6
 452         addccc  @acc[7],@acc[7],$t7
 453         addc    %g0,%g0,$carry
 454
 455         subcc   $t0,-1,$t0              ! .Lreduce_by_sub but without stores
 456         subccc  $t1,-1,$t1
 457         subccc  $t2,-1,$t2
 458         subccc  $t3, 0,$t3
 459         subccc  $t4, 0,$t4
 460         subccc  $t5, 0,$t5
 461         subccc  $t6, 1,$t6
 462         subccc  $t7,-1,$t7
 463         subc    $carry,0,$carry
 464
 465         addcc   $t0,$carry,$t0          ! add synthesized modulus
 466         addccc  $t1,$carry,$t1
 467         neg     $carry,$bi
 468         addccc  $t2,$carry,$t2
 469         addccc  $t3,0,$t3
 470         addccc  $t4,0,$t4
 471         addccc  $t5,0,$t5
 472         addccc  $t6,$bi,$t6
 473         addc    $t7,$carry,$t7
 474
 475         addcc   $t0,@acc[0],@acc[0]     ! 2*a+a=3*a
 476         addccc  $t1,@acc[1],@acc[1]
 477         addccc  $t2,@acc[2],@acc[2]
 478         addccc  $t3,@acc[3],@acc[3]
 479         addccc  $t4,@acc[4],@acc[4]
 480         addccc  $t5,@acc[5],@acc[5]
 481         addccc  $t6,@acc[6],@acc[6]
 482         addccc  $t7,@acc[7],@acc[7]
 483         b       .Lreduce_by_sub
 484         addc    %g0,%g0,$carry
 485 .type   __ecp_nistz256_mul_by_3,#function
 486 .size   __ecp_nistz256_mul_by_3,.-__ecp_nistz256_mul_by_3
 487
 488 ! void  ecp_nistz256_neg(BN_ULONG %i0[8],const BN_ULONG %i1[8]);
 489 .globl  ecp_nistz256_neg
 490 .align  32
 491 ecp_nistz256_neg:
 492         save    %sp,-STACK_FRAME,%sp
 493         mov     $ap,$bp
 494         mov     0,@acc[0]
 495         mov     0,@acc[1]
 496         mov     0,@acc[2]
 497         mov     0,@acc[3]
 498         mov     0,@acc[4]
 499         mov     0,@acc[5]
 500         mov     0,@acc[6]
 501         call    __ecp_nistz256_sub_from
 502         mov     0,@acc[7]
 503         ret
 504         restore
 505 .type   ecp_nistz256_neg,#function
 506 .size   ecp_nistz256_neg,.-ecp_nistz256_neg
 507
 508 .align  32
 509 __ecp_nistz256_sub_from:
 510         ld      [$bp+0],$t0             ! b[0]
 511         ld      [$bp+4],$t1
 512         ld      [$bp+8],$t2
 513         ld      [$bp+12],$t3
 514         subcc   @acc[0],$t0,@acc[0]
 515         ld      [$bp+16],$t4
 516         ld      [$bp+20],$t5
 517         subccc  @acc[1],$t1,@acc[1]
 518         subccc  @acc[2],$t2,@acc[2]
 519         ld      [$bp+24],$t6
 520         ld      [$bp+28],$t7
 521         subccc  @acc[3],$t3,@acc[3]
 522         subccc  @acc[4],$t4,@acc[4]
 523         subccc  @acc[5],$t5,@acc[5]
 524         subccc  @acc[6],$t6,@acc[6]
 525         subccc  @acc[7],$t7,@acc[7]
 526         subc    %g0,%g0,$carry          ! broadcast borrow bit
 527
 528 .Lreduce_by_add:
 529
 530         ! if a-b borrows, add modulus.
 531         !
 532         ! Note that because mod has special form, i.e. consists of
 533         ! 0xffffffff, 1 and 0s, we can conditionally synthesize it by
 534         ! using value of broadcasted borrow and the borrow bit itself.
 535         ! To minimize dependency chain we first broadcast and then
 536         ! extract the bit by negating (follow $bi).
 537
 538         addcc   @acc[0],$carry,@acc[0]  ! add synthesized modulus
 539         addccc  @acc[1],$carry,@acc[1]
 540         neg     $carry,$bi
 541         st      @acc[0],[$rp]
 542         addccc  @acc[2],$carry,@acc[2]
 543         st      @acc[1],[$rp+4]
 544         addccc  @acc[3],0,@acc[3]
 545         st      @acc[2],[$rp+8]
 546         addccc  @acc[4],0,@acc[4]
 547         st      @acc[3],[$rp+12]
 548         addccc  @acc[5],0,@acc[5]
 549         st      @acc[4],[$rp+16]
 550         addccc  @acc[6],$bi,@acc[6]
 551         st      @acc[5],[$rp+20]
 552         addc    @acc[7],$carry,@acc[7]
 553         st      @acc[6],[$rp+24]
 554         retl
 555         st      @acc[7],[$rp+28]
 556 .type   __ecp_nistz256_sub_from,#function
 557 .size   __ecp_nistz256_sub_from,.-__ecp_nistz256_sub_from
 558
 559 .align  32
 560 __ecp_nistz256_sub_morf:
 561         ld      [$bp+0],$t0             ! b[0]
 562         ld      [$bp+4],$t1
 563         ld      [$bp+8],$t2
 564         ld      [$bp+12],$t3
 565         subcc   $t0,@acc[0],@acc[0]
 566         ld      [$bp+16],$t4
 567         ld      [$bp+20],$t5
 568         subccc  $t1,@acc[1],@acc[1]
 569         subccc  $t2,@acc[2],@acc[2]
 570         ld      [$bp+24],$t6
 571         ld      [$bp+28],$t7
 572         subccc  $t3,@acc[3],@acc[3]
 573         subccc  $t4,@acc[4],@acc[4]
 574         subccc  $t5,@acc[5],@acc[5]
 575         subccc  $t6,@acc[6],@acc[6]
 576         subccc  $t7,@acc[7],@acc[7]
 577         b       .Lreduce_by_add
 578         subc    %g0,%g0,$carry          ! broadcast borrow bit
 579 .type   __ecp_nistz256_sub_morf,#function
 580 .size   __ecp_nistz256_sub_morf,.-__ecp_nistz256_sub_morf
 581
 582 ! void  ecp_nistz256_div_by_2(BN_ULONG %i0[8],const BN_ULONG %i1[8]);
 583 .globl  ecp_nistz256_div_by_2
 584 .align  32
 585 ecp_nistz256_div_by_2:
 586         save    %sp,-STACK_FRAME,%sp
 587         ld      [$ap],@acc[0]
 588         ld      [$ap+4],@acc[1]
 589         ld      [$ap+8],@acc[2]
 590         ld      [$ap+12],@acc[3]
 591         ld      [$ap+16],@acc[4]
 592         ld      [$ap+20],@acc[5]
 593         ld      [$ap+24],@acc[6]
 594         call    __ecp_nistz256_div_by_2
 595         ld      [$ap+28],@acc[7]
 596         ret
 597         restore
 598 .type   ecp_nistz256_div_by_2,#function
 599 .size   ecp_nistz256_div_by_2,.-ecp_nistz256_div_by_2
 600
 601 .align  32
 602 __ecp_nistz256_div_by_2:
 603         ! ret = (a is odd ? a+mod : a) >> 1
 604
 605         and     @acc[0],1,$bi
 606         neg     $bi,$carry
 607         addcc   @acc[0],$carry,@acc[0]
 608         addccc  @acc[1],$carry,@acc[1]
 609         addccc  @acc[2],$carry,@acc[2]
 610         addccc  @acc[3],0,@acc[3]
 611         addccc  @acc[4],0,@acc[4]
 612         addccc  @acc[5],0,@acc[5]
 613         addccc  @acc[6],$bi,@acc[6]
 614         addccc  @acc[7],$carry,@acc[7]
 615         addc    %g0,%g0,$carry
 616
 617         ! ret >>= 1
 618
 619         srl     @acc[0],1,@acc[0]
 620         sll     @acc[1],31,$t0
 621         srl     @acc[1],1,@acc[1]
 622         or      @acc[0],$t0,@acc[0]
 623         sll     @acc[2],31,$t1
 624         srl     @acc[2],1,@acc[2]
 625         or      @acc[1],$t1,@acc[1]
 626         sll     @acc[3],31,$t2
 627         st      @acc[0],[$rp]
 628         srl     @acc[3],1,@acc[3]
 629         or      @acc[2],$t2,@acc[2]
 630         sll     @acc[4],31,$t3
 631         st      @acc[1],[$rp+4]
 632         srl     @acc[4],1,@acc[4]
 633         or      @acc[3],$t3,@acc[3]
 634         sll     @acc[5],31,$t4
 635         st      @acc[2],[$rp+8]
 636         srl     @acc[5],1,@acc[5]
 637         or      @acc[4],$t4,@acc[4]
 638         sll     @acc[6],31,$t5
 639         st      @acc[3],[$rp+12]
 640         srl     @acc[6],1,@acc[6]
 641         or      @acc[5],$t5,@acc[5]
 642         sll     @acc[7],31,$t6
 643         st      @acc[4],[$rp+16]
 644         srl     @acc[7],1,@acc[7]
 645         or      @acc[6],$t6,@acc[6]
 646         sll     $carry,31,$t7
 647         st      @acc[5],[$rp+20]
 648         or      @acc[7],$t7,@acc[7]
 649         st      @acc[6],[$rp+24]
 650         retl
 651         st      @acc[7],[$rp+28]
 652 .type   __ecp_nistz256_div_by_2,#function
 653 .size   __ecp_nistz256_div_by_2,.-__ecp_nistz256_div_by_2
 654 ___
 655
 656 ########################################################################
 657 # following subroutines are "literal" implementation of those found in
 658 # ecp_nistz256.c
 659 #
 660 ########################################################################
 661 # void ecp_nistz256_point_double(P256_POINT *out,const P256_POINT *inp);
 662 #
 663 {
 664 my ($S,$M,$Zsqr,$tmp0)=map(32*$_,(0..3));
 665 # above map() describes stack layout with 4 temporary
 666 # 256-bit vectors on top.
 667
 668 $code.=<<___;
 669 #if 0
 670 #ifdef __PIC__
 671 SPARC_PIC_THUNK(%g1)
 672 #endif
 673 #endif
 674
 675 .globl  ecp_nistz256_point_double
 676 .align  32
 677 ecp_nistz256_point_double:
 678 #if 0
 679         SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
 680         ld      [%g1],%g1               ! OPENSSL_sparcv9cap_P[0]
 681         and     %g1,(SPARCV9_VIS3|SPARCV9_64BIT_STACK),%g1
 682         cmp     %g1,(SPARCV9_VIS3|SPARCV9_64BIT_STACK)
 683         be      ecp_nistz256_point_double_vis3
 684         nop
 685 #endif
 686
 687         save    %sp,-STACK_FRAME-32*4,%sp
 688
 689         mov     $rp,$rp_real
 690         mov     $ap,$ap_real
 691
 692 .Lpoint_double_shortcut:
 693         ld      [$ap+32],@acc[0]
 694         ld      [$ap+32+4],@acc[1]
 695         ld      [$ap+32+8],@acc[2]
 696         ld      [$ap+32+12],@acc[3]
 697         ld      [$ap+32+16],@acc[4]
 698         ld      [$ap+32+20],@acc[5]
 699         ld      [$ap+32+24],@acc[6]
 700         ld      [$ap+32+28],@acc[7]
 701         call    __ecp_nistz256_mul_by_2 ! p256_mul_by_2(S, in_y);
 702         add     %sp,LOCALS+$S,$rp
 703
 704         add     $ap_real,64,$bp
 705         add     $ap_real,64,$ap
 706         call    __ecp_nistz256_mul_mont ! p256_sqr_mont(Zsqr, in_z);
 707         add     %sp,LOCALS+$Zsqr,$rp
 708
 709         add     $ap_real,0,$bp
 710         call    __ecp_nistz256_add      ! p256_add(M, Zsqr, in_x);
 711         add     %sp,LOCALS+$M,$rp
 712
 713         add     %sp,LOCALS+$S,$bp
 714         add     %sp,LOCALS+$S,$ap
 715         call    __ecp_nistz256_mul_mont ! p256_sqr_mont(S, S);
 716         add     %sp,LOCALS+$S,$rp
 717
 718         ld      [$ap_real],@acc[0]
 719         add     %sp,LOCALS+$Zsqr,$bp
 720         ld      [$ap_real+4],@acc[1]
 721         ld      [$ap_real+8],@acc[2]
 722         ld      [$ap_real+12],@acc[3]
 723         ld      [$ap_real+16],@acc[4]
 724         ld      [$ap_real+20],@acc[5]
 725         ld      [$ap_real+24],@acc[6]
 726         ld      [$ap_real+28],@acc[7]
 727         call    __ecp_nistz256_sub_from ! p256_sub(Zsqr, in_x, Zsqr);
 728         add     %sp,LOCALS+$Zsqr,$rp
 729
 730         add     $ap_real,32,$bp
 731         add     $ap_real,64,$ap
 732         call    __ecp_nistz256_mul_mont ! p256_mul_mont(tmp0, in_z, in_y);
 733         add     %sp,LOCALS+$tmp0,$rp
 734
 735         call    __ecp_nistz256_mul_by_2 ! p256_mul_by_2(res_z, tmp0);
 736         add     $rp_real,64,$rp
 737
 738         add     %sp,LOCALS+$Zsqr,$bp
 739         add     %sp,LOCALS+$M,$ap
 740         call    __ecp_nistz256_mul_mont ! p256_mul_mont(M, M, Zsqr);
 741         add     %sp,LOCALS+$M,$rp
 742
 743         call    __ecp_nistz256_mul_by_3 ! p256_mul_by_3(M, M);
 744         add     %sp,LOCALS+$M,$rp
 745
 746         add     %sp,LOCALS+$S,$bp
 747         add     %sp,LOCALS+$S,$ap
 748         call    __ecp_nistz256_mul_mont ! p256_sqr_mont(tmp0, S);
 749         add     %sp,LOCALS+$tmp0,$rp
 750
 751         call    __ecp_nistz256_div_by_2 ! p256_div_by_2(res_y, tmp0);
 752         add     $rp_real,32,$rp
 753
 754         add     $ap_real,0,$bp
 755         add     %sp,LOCALS+$S,$ap
 756         call    __ecp_nistz256_mul_mont ! p256_mul_mont(S, S, in_x);
 757         add     %sp,LOCALS+$S,$rp
 758
 759         call    __ecp_nistz256_mul_by_2 ! p256_mul_by_2(tmp0, S);
 760         add     %sp,LOCALS+$tmp0,$rp
 761
 762         add     %sp,LOCALS+$M,$bp
 763         add     %sp,LOCALS+$M,$ap
 764         call    __ecp_nistz256_mul_mont ! p256_sqr_mont(res_x, M);
 765         add     $rp_real,0,$rp
 766
 767         add     %sp,LOCALS+$tmp0,$bp
 768         call    __ecp_nistz256_sub_from ! p256_sub(res_x, res_x, tmp0);
 769         add     $rp_real,0,$rp
 770
 771         add     %sp,LOCALS+$S,$bp
 772         call    __ecp_nistz256_sub_morf ! p256_sub(S, S, res_x);
 773         add     %sp,LOCALS+$S,$rp
 774
 775         add     %sp,LOCALS+$M,$bp
 776         add     %sp,LOCALS+$S,$ap
 777         call    __ecp_nistz256_mul_mont ! p256_mul_mont(S, S, M);
 778         add     %sp,LOCALS+$S,$rp
 779
 780         add     $rp_real,32,$bp
 781         call    __ecp_nistz256_sub_from ! p256_sub(res_y, S, res_y);
 782         add     $rp_real,32,$rp
 783
 784         ret
 785         restore
 786 .type   ecp_nistz256_point_double,#function
 787 .size   ecp_nistz256_point_double,.-ecp_nistz256_point_double
 788 ___
 789 }
 790
 791 ########################################################################
 792 # void ecp_nistz256_point_add(P256_POINT *out,const P256_POINT *in1,
 793 #                             const P256_POINT *in2);
 794 {
 795 my ($res_x,$res_y,$res_z,
 796     $H,$Hsqr,$R,$Rsqr,$Hcub,
 797     $U1,$U2,$S1,$S2)=map(32*$_,(0..11));
 798 my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr);
 799
 800 # above map() describes stack layout with 12 temporary
 801 # 256-bit vectors on top. Then we reserve some space for
 802 # !in1infty, !in2infty, result of check for zero and return pointer.
 803
 804 my $bp_real=$rp_real;
 805
 806 $code.=<<___;
 807 .globl  ecp_nistz256_point_add
 808 .align  32
 809 ecp_nistz256_point_add:
 810 #if 0
 811         SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
 812         ld      [%g1],%g1               ! OPENSSL_sparcv9cap_P[0]
 813         and     %g1,(SPARCV9_VIS3|SPARCV9_64BIT_STACK),%g1
 814         cmp     %g1,(SPARCV9_VIS3|SPARCV9_64BIT_STACK)
 815         be      ecp_nistz256_point_add_vis3
 816         nop
 817 #endif
 818
 819         save    %sp,-STACK_FRAME-32*12-32,%sp
 820
 821         stx     $rp,[%fp+STACK_BIAS-8]  ! off-load $rp
 822         mov     $ap,$ap_real
 823         mov     $bp,$bp_real
 824
 825         ld      [$bp+64],$t0            ! in2_z
 826         ld      [$bp+64+4],$t1
 827         ld      [$bp+64+8],$t2
 828         ld      [$bp+64+12],$t3
 829         ld      [$bp+64+16],$t4
 830         ld      [$bp+64+20],$t5
 831         ld      [$bp+64+24],$t6
 832         ld      [$bp+64+28],$t7
 833         or      $t1,$t0,$t0
 834         or      $t3,$t2,$t2
 835         or      $t5,$t4,$t4
 836         or      $t7,$t6,$t6
 837         or      $t2,$t0,$t0
 838         or      $t6,$t4,$t4
 839         or      $t4,$t0,$t0             ! !in2infty
 840         movrnz  $t0,-1,$t0
 841         st      $t0,[%fp+STACK_BIAS-12]
 842
 843         ld      [$ap+64],$t0            ! in1_z
 844         ld      [$ap+64+4],$t1
 845         ld      [$ap+64+8],$t2
 846         ld      [$ap+64+12],$t3
 847         ld      [$ap+64+16],$t4
 848         ld      [$ap+64+20],$t5
 849         ld      [$ap+64+24],$t6
 850         ld      [$ap+64+28],$t7
 851         or      $t1,$t0,$t0
 852         or      $t3,$t2,$t2
 853         or      $t5,$t4,$t4
 854         or      $t7,$t6,$t6
 855         or      $t2,$t0,$t0
 856         or      $t6,$t4,$t4
 857         or      $t4,$t0,$t0             ! !in1infty
 858         movrnz  $t0,-1,$t0
 859         st      $t0,[%fp+STACK_BIAS-16]
 860
 861         add     $bp_real,64,$bp
 862         add     $bp_real,64,$ap
 863         call    __ecp_nistz256_mul_mont ! p256_sqr_mont(Z2sqr, in2_z);
 864         add     %sp,LOCALS+$Z2sqr,$rp
 865
 866         add     $ap_real,64,$bp
 867         add     $ap_real,64,$ap
 868         call    __ecp_nistz256_mul_mont ! p256_sqr_mont(Z1sqr, in1_z);
 869         add     %sp,LOCALS+$Z1sqr,$rp
 870
 871         add     $bp_real,64,$bp
 872         add     %sp,LOCALS+$Z2sqr,$ap
 873         call    __ecp_nistz256_mul_mont ! p256_mul_mont(S1, Z2sqr, in2_z);
 874         add     %sp,LOCALS+$S1,$rp
 875
 876         add     $ap_real,64,$bp
 877         add     %sp,LOCALS+$Z1sqr,$ap
 878         call    __ecp_nistz256_mul_mont ! p256_mul_mont(S2, Z1sqr, in1_z);
 879         add     %sp,LOCALS+$S2,$rp
 880
 881         add     $ap_real,32,$bp
 882         add     %sp,LOCALS+$S1,$ap
 883         call    __ecp_nistz256_mul_mont ! p256_mul_mont(S1, S1, in1_y);
 884         add     %sp,LOCALS+$S1,$rp
 885
 886         add     $bp_real,32,$bp
 887         add     %sp,LOCALS+$S2,$ap
 888         call    __ecp_nistz256_mul_mont ! p256_mul_mont(S2, S2, in2_y);
 889         add     %sp,LOCALS+$S2,$rp
 890
 891         add     %sp,LOCALS+$S1,$bp
 892         call    __ecp_nistz256_sub_from ! p256_sub(R, S2, S1);
 893         add     %sp,LOCALS+$R,$rp
 894
 895         or      @acc[1],@acc[0],@acc[0] ! see if result is zero
 896         or      @acc[3],@acc[2],@acc[2]
 897         or      @acc[5],@acc[4],@acc[4]
 898         or      @acc[7],@acc[6],@acc[6]
 899         or      @acc[2],@acc[0],@acc[0]
 900         or      @acc[6],@acc[4],@acc[4]
 901         or      @acc[4],@acc[0],@acc[0]
 902         st      @acc[0],[%fp+STACK_BIAS-20]
 903
 904         add     $ap_real,0,$bp
 905         add     %sp,LOCALS+$Z2sqr,$ap
 906         call    __ecp_nistz256_mul_mont ! p256_mul_mont(U1, in1_x, Z2sqr);
 907         add     %sp,LOCALS+$U1,$rp
 908
 909         add     $bp_real,0,$bp
 910         add     %sp,LOCALS+$Z1sqr,$ap
 911         call    __ecp_nistz256_mul_mont ! p256_mul_mont(U2, in2_x, Z1sqr);
 912         add     %sp,LOCALS+$U2,$rp
 913
 914         add     %sp,LOCALS+$U1,$bp
 915         call    __ecp_nistz256_sub_from ! p256_sub(H, U2, U1);
 916         add     %sp,LOCALS+$H,$rp
 917
 918         or      @acc[1],@acc[0],@acc[0] ! see if result is zero
 919         or      @acc[3],@acc[2],@acc[2]
 920         or      @acc[5],@acc[4],@acc[4]
 921         or      @acc[7],@acc[6],@acc[6]
 922         or      @acc[2],@acc[0],@acc[0]
 923         or      @acc[6],@acc[4],@acc[4]
 924         orcc    @acc[4],@acc[0],@acc[0]
 925
 926         bne,pt  %icc,.Ladd_proceed      ! is_equal(U1,U2)?
 927         nop
 928
 929         ld      [%fp+STACK_BIAS-12],$t0
 930         ld      [%fp+STACK_BIAS-16],$t1
 931         ld      [%fp+STACK_BIAS-20],$t2
 932         andcc   $t0,$t1,%g0
 933         be,pt   %icc,.Ladd_proceed      ! (in1infty || in2infty)?
 934         nop
 935         andcc   $t2,$t2,%g0
 936         be,pt   %icc,.Ladd_double       ! is_equal(S1,S2)?
 937         nop
 938
 939         ldx     [%fp+STACK_BIAS-8],$rp
 940         st      %g0,[$rp]
 941         st      %g0,[$rp+4]
 942         st      %g0,[$rp+8]
 943         st      %g0,[$rp+12]
 944         st      %g0,[$rp+16]
 945         st      %g0,[$rp+20]
 946         st      %g0,[$rp+24]
 947         st      %g0,[$rp+28]
 948         st      %g0,[$rp+32]
 949         st      %g0,[$rp+32+4]
 950         st      %g0,[$rp+32+8]
 951         st      %g0,[$rp+32+12]
 952         st      %g0,[$rp+32+16]
 953         st      %g0,[$rp+32+20]
 954         st      %g0,[$rp+32+24]
 955         st      %g0,[$rp+32+28]
 956         st      %g0,[$rp+64]
 957         st      %g0,[$rp+64+4]
 958         st      %g0,[$rp+64+8]
 959         st      %g0,[$rp+64+12]
 960         st      %g0,[$rp+64+16]
 961         st      %g0,[$rp+64+20]
 962         st      %g0,[$rp+64+24]
 963         st      %g0,[$rp+64+28]
 964         b       .Ladd_done
 965         nop
 966
 967 .align  16
 968 .Ladd_double:
 969         ldx     [%fp+STACK_BIAS-8],$rp_real
 970         mov     $ap_real,$ap
 971         b       .Lpoint_double_shortcut
 972         add     %sp,32*(12-4)+32,%sp    ! difference in frame sizes
 973
 974 .align  16
 975 .Ladd_proceed:
 976         add     %sp,LOCALS+$R,$bp
 977         add     %sp,LOCALS+$R,$ap
 978         call    __ecp_nistz256_mul_mont ! p256_sqr_mont(Rsqr, R);
 979         add     %sp,LOCALS+$Rsqr,$rp
 980
 981         add     $ap_real,64,$bp
 982         add     %sp,LOCALS+$H,$ap
 983         call    __ecp_nistz256_mul_mont ! p256_mul_mont(res_z, H, in1_z);
 984         add     %sp,LOCALS+$res_z,$rp
 985
 986         add     %sp,LOCALS+$H,$bp
 987         add     %sp,LOCALS+$H,$ap
 988         call    __ecp_nistz256_mul_mont ! p256_sqr_mont(Hsqr, H);
 989         add     %sp,LOCALS+$Hsqr,$rp
 990
 991         add     $bp_real,64,$bp
 992         add     %sp,LOCALS+$res_z,$ap
 993         call    __ecp_nistz256_mul_mont ! p256_mul_mont(res_z, res_z, in2_z);
 994         add     %sp,LOCALS+$res_z,$rp
 995
 996         add     %sp,LOCALS+$H,$bp
 997         add     %sp,LOCALS+$Hsqr,$ap
 998         call    __ecp_nistz256_mul_mont ! p256_mul_mont(Hcub, Hsqr, H);
 999         add     %sp,LOCALS+$Hcub,$rp
1000
1001         add     %sp,LOCALS+$U1,$bp
1002         add     %sp,LOCALS+$Hsqr,$ap
1003         call    __ecp_nistz256_mul_mont ! p256_mul_mont(U2, U1, Hsqr);
1004         add     %sp,LOCALS+$U2,$rp
1005
1006         call    __ecp_nistz256_mul_by_2 ! p256_mul_by_2(Hsqr, U2);
1007         add     %sp,LOCALS+$Hsqr,$rp
1008
1009         add     %sp,LOCALS+$Rsqr,$bp
1010         call    __ecp_nistz256_sub_morf ! p256_sub(res_x, Rsqr, Hsqr);
1011         add     %sp,LOCALS+$res_x,$rp
1012
1013         add     %sp,LOCALS+$Hcub,$bp
1014         call    __ecp_nistz256_sub_from !  p256_sub(res_x, res_x, Hcub);
1015         add     %sp,LOCALS+$res_x,$rp
1016
1017         add     %sp,LOCALS+$U2,$bp
1018         call    __ecp_nistz256_sub_morf ! p256_sub(res_y, U2, res_x);
1019         add     %sp,LOCALS+$res_y,$rp
1020
1021         add     %sp,LOCALS+$Hcub,$bp
1022         add     %sp,LOCALS+$S1,$ap
1023         call    __ecp_nistz256_mul_mont ! p256_mul_mont(S2, S1, Hcub);
1024         add     %sp,LOCALS+$S2,$rp
1025
1026         add     %sp,LOCALS+$R,$bp
1027         add     %sp,LOCALS+$res_y,$ap
1028         call    __ecp_nistz256_mul_mont ! p256_mul_mont(res_y, res_y, R);
1029         add     %sp,LOCALS+$res_y,$rp
1030
1031         add     %sp,LOCALS+$S2,$bp
1032         call    __ecp_nistz256_sub_from ! p256_sub(res_y, res_y, S2);
1033         add     %sp,LOCALS+$res_y,$rp
1034
1035         ld      [%fp+STACK_BIAS-16],$t1 ! !in1infty
1036         ld      [%fp+STACK_BIAS-12],$t2 ! !in2infty
1037         ldx     [%fp+STACK_BIAS-8],$rp
1038 ___
1039 for($i=0;$i<96;$i+=8) {                 # conditional moves
1040 $code.=<<___;
1041         ld      [%sp+LOCALS+$i],@acc[0]         ! res
1042         ld      [%sp+LOCALS+$i+4],@acc[1]
1043         ld      [$bp_real+$i],@acc[2]           ! in2
1044         ld      [$bp_real+$i+4],@acc[3]
1045         ld      [$ap_real+$i],@acc[4]           ! in1
1046         ld      [$ap_real+$i+4],@acc[5]
1047         movrz   $t1,@acc[2],@acc[0]
1048         movrz   $t1,@acc[3],@acc[1]
1049         movrz   $t2,@acc[4],@acc[0]
1050         movrz   $t2,@acc[5],@acc[1]
1051         st      @acc[0],[$rp+$i]
1052         st      @acc[1],[$rp+$i+4]
1053 ___
1054 }
1055 $code.=<<___;
1056 .Ladd_done:
1057         ret
1058         restore
1059 .type   ecp_nistz256_point_add,#function
1060 .size   ecp_nistz256_point_add,.-ecp_nistz256_point_add
1061 ___
1062 }
1063
1064 ########################################################################
1065 # void ecp_nistz256_point_add_affine(P256_POINT *out,const P256_POINT *in1,
1066 #                                    const P256_POINT_AFFINE *in2);
1067 {
1068 my ($res_x,$res_y,$res_z,
1069     $U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr)=map(32*$_,(0..9));
1070 my $Z1sqr = $S2;
1071 # above map() describes stack layout with 10 temporary
1072 # 256-bit vectors on top. Then we reserve some space for
1073 # !in1infty, !in2infty, result of check for zero and return pointer.
1074
1075 my @ONE_mont=(1,0,0,-1,-1,-1,-2,0);
1076 my $bp_real=$rp_real;
1077
1078 $code.=<<___;
1079 .globl  ecp_nistz256_point_add_affine
1080 .align  32
1081 ecp_nistz256_point_add_affine:
1082 #if 0
1083         SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
1084         ld      [%g1],%g1               ! OPENSSL_sparcv9cap_P[0]
1085         and     %g1,(SPARCV9_VIS3|SPARCV9_64BIT_STACK),%g1
1086         cmp     %g1,(SPARCV9_VIS3|SPARCV9_64BIT_STACK)
1087         be      ecp_nistz256_point_add_affine_vis3
1088         nop
1089 #endif
1090
1091         save    %sp,-STACK_FRAME-32*10-32,%sp
1092
1093         stx     $rp,[%fp+STACK_BIAS-8]  ! off-load $rp
1094         mov     $ap,$ap_real
1095         mov     $bp,$bp_real
1096
1097         ld      [$ap+64],$t0            ! in1_z
1098         ld      [$ap+64+4],$t1
1099         ld      [$ap+64+8],$t2
1100         ld      [$ap+64+12],$t3
1101         ld      [$ap+64+16],$t4
1102         ld      [$ap+64+20],$t5
1103         ld      [$ap+64+24],$t6
1104         ld      [$ap+64+28],$t7
1105         or      $t1,$t0,$t0
1106         or      $t3,$t2,$t2
1107         or      $t5,$t4,$t4
1108         or      $t7,$t6,$t6
1109         or      $t2,$t0,$t0
1110         or      $t6,$t4,$t4
1111         or      $t4,$t0,$t0             ! !in1infty
1112         movrnz  $t0,-1,$t0
1113         st      $t0,[%fp+STACK_BIAS-16]
1114
1115         ld      [$bp],@acc[0]           ! in2_x
1116         ld      [$bp+4],@acc[1]
1117         ld      [$bp+8],@acc[2]
1118         ld      [$bp+12],@acc[3]
1119         ld      [$bp+16],@acc[4]
1120         ld      [$bp+20],@acc[5]
1121         ld      [$bp+24],@acc[6]
1122         ld      [$bp+28],@acc[7]
1123         ld      [$bp+32],$t0            ! in2_y
1124         ld      [$bp+32+4],$t1
1125         ld      [$bp+32+8],$t2
1126         ld      [$bp+32+12],$t3
1127         ld      [$bp+32+16],$t4
1128         ld      [$bp+32+20],$t5
1129         ld      [$bp+32+24],$t6
1130         ld      [$bp+32+28],$t7
1131         or      @acc[1],@acc[0],@acc[0]
1132         or      @acc[3],@acc[2],@acc[2]
1133         or      @acc[5],@acc[4],@acc[4]
1134         or      @acc[7],@acc[6],@acc[6]
1135         or      @acc[2],@acc[0],@acc[0]
1136         or      @acc[6],@acc[4],@acc[4]
1137         or      @acc[4],@acc[0],@acc[0]
1138         or      $t1,$t0,$t0
1139         or      $t3,$t2,$t2
1140         or      $t5,$t4,$t4
1141         or      $t7,$t6,$t6
1142         or      $t2,$t0,$t0
1143         or      $t6,$t4,$t4
1144         or      $t4,$t0,$t0
1145         or      @acc[0],$t0,$t0         ! !in2infty
1146         movrnz  $t0,-1,$t0
1147         st      $t0,[%fp+STACK_BIAS-12]
1148
1149         add     $ap_real,64,$bp
1150         add     $ap_real,64,$ap
1151         call    __ecp_nistz256_mul_mont ! p256_sqr_mont(Z1sqr, in1_z);
1152         add     %sp,LOCALS+$Z1sqr,$rp
1153
1154         add     $bp_real,0,$bp
1155         add     %sp,LOCALS+$Z1sqr,$ap
1156         call    __ecp_nistz256_mul_mont ! p256_mul_mont(U2, Z1sqr, in2_x);
1157         add     %sp,LOCALS+$U2,$rp
1158
1159         add     $ap_real,0,$bp
1160         call    __ecp_nistz256_sub_from ! p256_sub(H, U2, in1_x);
1161         add     %sp,LOCALS+$H,$rp
1162
1163         add     $ap_real,64,$bp
1164         add     %sp,LOCALS+$Z1sqr,$ap
1165         call    __ecp_nistz256_mul_mont ! p256_mul_mont(S2, Z1sqr, in1_z);
1166         add     %sp,LOCALS+$S2,$rp
1167
1168         add     $ap_real,64,$bp
1169         add     %sp,LOCALS+$H,$ap
1170         call    __ecp_nistz256_mul_mont ! p256_mul_mont(res_z, H, in1_z);
1171         add     %sp,LOCALS+$res_z,$rp
1172
1173         add     $bp_real,32,$bp
1174         add     %sp,LOCALS+$S2,$ap
1175         call    __ecp_nistz256_mul_mont ! p256_mul_mont(S2, S2, in2_y);
1176         add     %sp,LOCALS+$S2,$rp
1177
1178         add     $ap_real,32,$bp
1179         call    __ecp_nistz256_sub_from ! p256_sub(R, S2, in1_y);
1180         add     %sp,LOCALS+$R,$rp
1181
1182         add     %sp,LOCALS+$H,$bp
1183         add     %sp,LOCALS+$H,$ap
1184         call    __ecp_nistz256_mul_mont ! p256_sqr_mont(Hsqr, H);
1185         add     %sp,LOCALS+$Hsqr,$rp
1186
1187         add     %sp,LOCALS+$R,$bp
1188         add     %sp,LOCALS+$R,$ap
1189         call    __ecp_nistz256_mul_mont ! p256_sqr_mont(Rsqr, R);
1190         add     %sp,LOCALS+$Rsqr,$rp
1191
1192         add     %sp,LOCALS+$H,$bp
1193         add     %sp,LOCALS+$Hsqr,$ap
1194         call    __ecp_nistz256_mul_mont ! p256_mul_mont(Hcub, Hsqr, H);
1195         add     %sp,LOCALS+$Hcub,$rp
1196
1197         add     $ap_real,0,$bp
1198         add     %sp,LOCALS+$Hsqr,$ap
1199         call    __ecp_nistz256_mul_mont ! p256_mul_mont(U2, in1_x, Hsqr);
1200         add     %sp,LOCALS+$U2,$rp
1201
1202         call    __ecp_nistz256_mul_by_2 ! p256_mul_by_2(Hsqr, U2);
1203         add     %sp,LOCALS+$Hsqr,$rp
1204
1205         add     %sp,LOCALS+$Rsqr,$bp
1206         call    __ecp_nistz256_sub_morf ! p256_sub(res_x, Rsqr, Hsqr);
1207         add     %sp,LOCALS+$res_x,$rp
1208
1209         add     %sp,LOCALS+$Hcub,$bp
1210         call    __ecp_nistz256_sub_from !  p256_sub(res_x, res_x, Hcub);
1211         add     %sp,LOCALS+$res_x,$rp
1212
1213         add     %sp,LOCALS+$U2,$bp
1214         call    __ecp_nistz256_sub_morf ! p256_sub(res_y, U2, res_x);
1215         add     %sp,LOCALS+$res_y,$rp
1216
1217         add     $ap_real,32,$bp
1218         add     %sp,LOCALS+$Hcub,$ap
1219         call    __ecp_nistz256_mul_mont ! p256_mul_mont(S2, in1_y, Hcub);
1220         add     %sp,LOCALS+$S2,$rp
1221
1222         add     %sp,LOCALS+$R,$bp
1223         add     %sp,LOCALS+$res_y,$ap
1224         call    __ecp_nistz256_mul_mont ! p256_mul_mont(res_y, res_y, R);
1225         add     %sp,LOCALS+$res_y,$rp
1226
1227         add     %sp,LOCALS+$S2,$bp
1228         call    __ecp_nistz256_sub_from ! p256_sub(res_y, res_y, S2);
1229         add     %sp,LOCALS+$res_y,$rp
1230
1231         ld      [%fp+STACK_BIAS-16],$t1 ! !in1infty
1232         ld      [%fp+STACK_BIAS-12],$t2 ! !in2infty
1233         ldx     [%fp+STACK_BIAS-8],$rp
1234 ___
1235 for($i=0;$i<64;$i+=8) {                 # conditional moves
1236 $code.=<<___;
1237         ld      [%sp+LOCALS+$i],@acc[0]         ! res
1238         ld      [%sp+LOCALS+$i+4],@acc[1]
1239         ld      [$bp_real+$i],@acc[2]           ! in2
1240         ld      [$bp_real+$i+4],@acc[3]
1241         ld      [$ap_real+$i],@acc[4]           ! in1
1242         ld      [$ap_real+$i+4],@acc[5]
1243         movrz   $t1,@acc[2],@acc[0]
1244         movrz   $t1,@acc[3],@acc[1]
1245         movrz   $t2,@acc[4],@acc[0]
1246         movrz   $t2,@acc[5],@acc[1]
1247         st      @acc[0],[$rp+$i]
1248         st      @acc[1],[$rp+$i+4]
1249 ___
1250 }
1251 for(;$i<96;$i+=8) {
1252 my $j=($i-64)/4;
1253 $code.=<<___;
1254         ld      [%sp+LOCALS+$i],@acc[0]         ! res
1255         ld      [%sp+LOCALS+$i+4],@acc[1]
1256         ld      [$ap_real+$i],@acc[4]           ! in1
1257         ld      [$ap_real+$i+4],@acc[5]
1258         movrz   $t1,@ONE_mont[$j],@acc[0]
1259         movrz   $t1,@ONE_mont[$j+1],@acc[1]
1260         movrz   $t2,@acc[4],@acc[0]
1261         movrz   $t2,@acc[5],@acc[1]
1262         st      @acc[0],[$rp+$i]
1263         st      @acc[1],[$rp+$i+4]
1264 ___
1265 }
1266 $code.=<<___;
1267         ret
1268         restore
1269 .type   ecp_nistz256_point_add_affine,#function
1270 .size   ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine
1271 ___
1272 }                                                               }}}
1273 {{{
1274 my ($out,$inp,$index)=map("%i$_",(0..2));
1275 my $mask="%o0";
1276
1277 $code.=<<___;
1278 ! void  ecp_nistz256_select_w5(P256_POINT *%i0,const void *%i1,
1279 !                                              int %i2);
1280 .globl  ecp_nistz256_select_w5
1281 .align  32
1282 ecp_nistz256_select_w5:
1283         save    %sp,-STACK_FRAME,%sp
1284
1285         neg     $index,$mask
1286         srax    $mask,63,$mask
1287
1288         add     $index,$mask,$index
1289         sll     $index,2,$index
1290         add     $inp,$index,$inp
1291
1292         ld      [$inp+64*0],%l0
1293         ld      [$inp+64*1],%l1
1294         ld      [$inp+64*2],%l2
1295         ld      [$inp+64*3],%l3
1296         ld      [$inp+64*4],%l4
1297         ld      [$inp+64*5],%l5
1298         ld      [$inp+64*6],%l6
1299         ld      [$inp+64*7],%l7
1300         add     $inp,64*8,$inp
1301         and     %l0,$mask,%l0
1302         and     %l1,$mask,%l1
1303         st      %l0,[$out]              ! X
1304         and     %l2,$mask,%l2
1305         st      %l1,[$out+4]
1306         and     %l3,$mask,%l3
1307         st      %l2,[$out+8]
1308         and     %l4,$mask,%l4
1309         st      %l3,[$out+12]
1310         and     %l5,$mask,%l5
1311         st      %l4,[$out+16]
1312         and     %l6,$mask,%l6
1313         st      %l5,[$out+20]
1314         and     %l7,$mask,%l7
1315         st      %l6,[$out+24]
1316         st      %l7,[$out+28]
1317         add     $out,32,$out
1318
1319         ld      [$inp+64*0],%l0
1320         ld      [$inp+64*1],%l1
1321         ld      [$inp+64*2],%l2
1322         ld      [$inp+64*3],%l3
1323         ld      [$inp+64*4],%l4
1324         ld      [$inp+64*5],%l5
1325         ld      [$inp+64*6],%l6
1326         ld      [$inp+64*7],%l7
1327         add     $inp,64*8,$inp
1328         and     %l0,$mask,%l0
1329         and     %l1,$mask,%l1
1330         st      %l0,[$out]              ! Y
1331         and     %l2,$mask,%l2
1332         st      %l1,[$out+4]
1333         and     %l3,$mask,%l3
1334         st      %l2,[$out+8]
1335         and     %l4,$mask,%l4
1336         st      %l3,[$out+12]
1337         and     %l5,$mask,%l5
1338         st      %l4,[$out+16]
1339         and     %l6,$mask,%l6
1340         st      %l5,[$out+20]
1341         and     %l7,$mask,%l7
1342         st      %l6,[$out+24]
1343         st      %l7,[$out+28]
1344         add     $out,32,$out
1345
1346         ld      [$inp+64*0],%l0
1347         ld      [$inp+64*1],%l1
1348         ld      [$inp+64*2],%l2
1349         ld      [$inp+64*3],%l3
1350         ld      [$inp+64*4],%l4
1351         ld      [$inp+64*5],%l5
1352         ld      [$inp+64*6],%l6
1353         ld      [$inp+64*7],%l7
1354         and     %l0,$mask,%l0
1355         and     %l1,$mask,%l1
1356         st      %l0,[$out]              ! Z
1357         and     %l2,$mask,%l2
1358         st      %l1,[$out+4]
1359         and     %l3,$mask,%l3
1360         st      %l2,[$out+8]
1361         and     %l4,$mask,%l4
1362         st      %l3,[$out+12]
1363         and     %l5,$mask,%l5
1364         st      %l4,[$out+16]
1365         and     %l6,$mask,%l6
1366         st      %l5,[$out+20]
1367         and     %l7,$mask,%l7
1368         st      %l6,[$out+24]
1369         st      %l7,[$out+28]
1370
1371         ret
1372         restore
1373 .type   ecp_nistz256_select_w5,#function
1374 .size   ecp_nistz256_select_w5,.-ecp_nistz256_select_w5
1375
1376 ! void  ecp_nistz256_select_w7(P256_POINT_AFFINE *%i0,const void *%i1,
1377 !                                                     int %i2);
1378 .globl  ecp_nistz256_select_w7
1379 .align  32
1380 ecp_nistz256_select_w7:
1381         save    %sp,-STACK_FRAME,%sp
1382
1383         neg     $index,$mask
1384         srax    $mask,63,$mask
1385
1386         add     $index,$mask,$index
1387         add     $inp,$index,$inp
1388         mov     64/4,$index
1389
1390 .Loop_select_w7:
1391         ldub    [$inp+64*0],%l0
1392         prefetch [$inp+3840+64*0],1
1393         subcc   $index,1,$index
1394         ldub    [$inp+64*1],%l1
1395         prefetch [$inp+3840+64*1],1
1396         ldub    [$inp+64*2],%l2
1397         prefetch [$inp+3840+64*2],1
1398         ldub    [$inp+64*3],%l3
1399         prefetch [$inp+3840+64*3],1
1400         add     $inp,64*4,$inp
1401         sll     %l1,8,%l1
1402         sll     %l2,16,%l2
1403         or      %l0,%l1,%l0
1404         sll     %l3,24,%l3
1405         or      %l0,%l2,%l0
1406         or      %l0,%l3,%l0
1407         and     %l0,$mask,%l0
1408         st      %l0,[$out]
1409         bne     .Loop_select_w7
1410         add     $out,4,$out
1411
1412         ret
1413         restore
1414 .type   ecp_nistz256_select_w7,#function
1415 .size   ecp_nistz256_select_w7,.-ecp_nistz256_select_w7
1416 ___
1417 }}}
1418 {{{
1419 ########################################################################
1420 # Following subroutines are VIS3 counterparts of those above that
1421 # implement ones found in ecp_nistz256.c. Key difference is that they
1422 # use 128-bit muliplication and addition with 64-bit carry, and in order
1423 # to do that they perform conversion from uin32_t[8] to uint64_t[4] upon
1424 # entry and vice versa on return.
1425 #
1426 my ($rp,$ap,$bp)=map("%i$_",(0..2));
1427 my ($t0,$t1,$t2,$t3,$a0,$a1,$a2,$a3)=map("%l$_",(0..7));
1428 my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5)=map("%o$_",(0..5));
1429 my ($bi,$poly1,$poly3,$minus1)=(map("%i$_",(3..5)),"%g1");
1430 my ($rp_real,$ap_real)=("%g2","%g3");
1431 my ($acc6,$acc7)=($bp,$bi);     # used in squaring
1432
1433 $code.=<<___;
1434 #if 0
1435 .align  32
1436 __ecp_nistz256_mul_by_2_vis3:
1437         addcc   $acc0,$acc0,$acc0
1438         addxccc $acc1,$acc1,$acc1
1439         addxccc $acc2,$acc2,$acc2
1440         addxccc $acc3,$acc3,$acc3
1441         b       .Lreduce_by_sub_vis3
1442         addxc   %g0,%g0,$acc4           ! did it carry?
1443 .type   __ecp_nistz256_mul_by_2_vis3,#function
1444 .size   __ecp_nistz256_mul_by_2_vis3,.-__ecp_nistz256_mul_by_2_vis3
1445
1446 .align  32
1447 __ecp_nistz256_add_vis3:
1448         ldx     [$bp+0],$t0
1449         ldx     [$bp+8],$t1
1450         ldx     [$bp+16],$t2
1451         ldx     [$bp+24],$t3
1452
1453 __ecp_nistz256_add_noload_vis3:
1454
1455         addcc   $t0,$acc0,$acc0
1456         addxccc $t1,$acc1,$acc1
1457         addxccc $t2,$acc2,$acc2
1458         addxccc $t3,$acc3,$acc3
1459         addxc   %g0,%g0,$acc4           ! did it carry?
1460
1461 .Lreduce_by_sub_vis3:
1462
1463         addcc   $acc0,1,$t0             ! add -modulus, i.e. subtract
1464         addxccc $acc1,$poly1,$t1
1465         addxccc $acc2,$minus1,$t2
1466         addxccc $acc3,$poly3,$t3
1467         addxc   $acc4,$minus1,$acc4
1468
1469         movrz   $acc4,$t0,$acc0         ! ret = borrow ? ret : ret-modulus
1470         movrz   $acc4,$t1,$acc1
1471         stx     $acc0,[$rp]
1472         movrz   $acc4,$t2,$acc2
1473         stx     $acc1,[$rp+8]
1474         movrz   $acc4,$t3,$acc3
1475         stx     $acc2,[$rp+16]
1476         retl
1477         stx     $acc3,[$rp+24]
1478 .type   __ecp_nistz256_add_vis3,#function
1479 .size   __ecp_nistz256_add_vis3,.-__ecp_nistz256_add_vis3
1480
1481 ! Trouble with subtraction is that there is no subtraction with 64-bit
1482 ! borrow, only with 32-bit one. For this reason we "decompose" 64-bit
1483 ! $acc0-$acc3 to 32-bit values and pick b[4] in 32-bit pieces. But
1484 ! recall that SPARC is big-endian, which is why you'll observe that
1485 ! b[4] is accessed as 4-0-12-8-20-16-28-24. And prior reduction we
1486 ! "collect" result back to 64-bit $acc0-$acc3.
1487 .align  32
1488 __ecp_nistz256_sub_from_vis3:
1489         ld      [$bp+4],$t0
1490         ld      [$bp+0],$t1
1491         ld      [$bp+12],$t2
1492         ld      [$bp+8],$t3
1493
1494         srlx    $acc0,32,$acc4
1495         not     $poly1,$poly1
1496         srlx    $acc1,32,$acc5
1497         subcc   $acc0,$t0,$acc0
1498         ld      [$bp+20],$t0
1499         subccc  $acc4,$t1,$acc4
1500         ld      [$bp+16],$t1
1501         subccc  $acc1,$t2,$acc1
1502         ld      [$bp+28],$t2
1503         and     $acc0,$poly1,$acc0
1504         subccc  $acc5,$t3,$acc5
1505         ld      [$bp+24],$t3
1506         sllx    $acc4,32,$acc4
1507         and     $acc1,$poly1,$acc1
1508         sllx    $acc5,32,$acc5
1509         or      $acc0,$acc4,$acc0
1510         srlx    $acc2,32,$acc4
1511         or      $acc1,$acc5,$acc1
1512         srlx    $acc3,32,$acc5
1513         subccc  $acc2,$t0,$acc2
1514         subccc  $acc4,$t1,$acc4
1515         subccc  $acc3,$t2,$acc3
1516         and     $acc2,$poly1,$acc2
1517         subccc  $acc5,$t3,$acc5
1518         sllx    $acc4,32,$acc4
1519         and     $acc3,$poly1,$acc3
1520         sllx    $acc5,32,$acc5
1521         or      $acc2,$acc4,$acc2
1522         subc    %g0,%g0,$acc4           ! did it borrow?
1523         b       .Lreduce_by_add_vis3
1524         or      $acc3,$acc5,$acc3
1525 .type   __ecp_nistz256_sub_from_vis3,#function
1526 .size   __ecp_nistz256_sub_from_vis3,.-__ecp_nistz256_sub_from_vis3
1527
1528 .align  32
1529 __ecp_nistz256_sub_morf_vis3:
1530         ld      [$bp+4],$t0
1531         ld      [$bp+0],$t1
1532         ld      [$bp+12],$t2
1533         ld      [$bp+8],$t3
1534
1535         srlx    $acc0,32,$acc4
1536         not     $poly1,$poly1
1537         srlx    $acc1,32,$acc5
1538         subcc   $t0,$acc0,$acc0
1539         ld      [$bp+20],$t0
1540         subccc  $t1,$acc4,$acc4
1541         ld      [$bp+16],$t1
1542         subccc  $t2,$acc1,$acc1
1543         ld      [$bp+28],$t2
1544         and     $acc0,$poly1,$acc0
1545         subccc  $t3,$acc5,$acc5
1546         ld      [$bp+24],$t3
1547         sllx    $acc4,32,$acc4
1548         and     $acc1,$poly1,$acc1
1549         sllx    $acc5,32,$acc5
1550         or      $acc0,$acc4,$acc0
1551         srlx    $acc2,32,$acc4
1552         or      $acc1,$acc5,$acc1
1553         srlx    $acc3,32,$acc5
1554         subccc  $t0,$acc2,$acc2
1555         subccc  $t1,$acc4,$acc4
1556         subccc  $t2,$acc3,$acc3
1557         and     $acc2,$poly1,$acc2
1558         subccc  $t3,$acc5,$acc5
1559         sllx    $acc4,32,$acc4
1560         and     $acc3,$poly1,$acc3
1561         sllx    $acc5,32,$acc5
1562         or      $acc2,$acc4,$acc2
1563         subc    %g0,%g0,$acc4           ! did it borrow?
1564         or      $acc3,$acc5,$acc3
1565
1566 .Lreduce_by_add_vis3:
1567
1568         addcc   $acc0,-1,$t0            ! add modulus
1569         not     $poly3,$t3
1570         addxccc $acc1,$poly1,$t1
1571         not     $poly1,$poly1           ! restore $poly1
1572         addxccc $acc2,%g0,$t2
1573         addxc   $acc3,$t3,$t3
1574
1575         movrnz  $acc4,$t0,$acc0         ! if a-b borrowed, ret = ret+mod
1576         movrnz  $acc4,$t1,$acc1
1577         stx     $acc0,[$rp]
1578         movrnz  $acc4,$t2,$acc2
1579         stx     $acc1,[$rp+8]
1580         movrnz  $acc4,$t3,$acc3
1581         stx     $acc2,[$rp+16]
1582         retl
1583         stx     $acc3,[$rp+24]
1584 .type   __ecp_nistz256_sub_morf_vis3,#function
1585 .size   __ecp_nistz256_sub_morf_vis3,.-__ecp_nistz256_sub_morf_vis3
1586
1587 .align  32
1588 __ecp_nistz256_div_by_2_vis3:
1589         ! ret = (a is odd ? a+mod : a) >> 1
1590
1591         not     $poly1,$t1
1592         not     $poly3,$t3
1593         and     $acc0,1,$acc5
1594         addcc   $acc0,-1,$t0            ! add modulus
1595         addxccc $acc1,$t1,$t1
1596         addxccc $acc2,%g0,$t2
1597         addxccc $acc3,$t3,$t3
1598         addxc   %g0,%g0,$acc4           ! carry bit
1599
1600         movrnz  $acc5,$t0,$acc0
1601         movrnz  $acc5,$t1,$acc1
1602         movrnz  $acc5,$t2,$acc2
1603         movrnz  $acc5,$t3,$acc3
1604         movrz   $acc5,%g0,$acc4
1605
1606         ! ret >>= 1
1607
1608         srlx    $acc0,1,$acc0
1609         sllx    $acc1,63,$t0
1610         srlx    $acc1,1,$acc1
1611         or      $acc0,$t0,$acc0
1612         sllx    $acc2,63,$t1
1613         srlx    $acc2,1,$acc2
1614         or      $acc1,$t1,$acc1
1615         sllx    $acc3,63,$t2
1616         stx     $acc0,[$rp]
1617         srlx    $acc3,1,$acc3
1618         or      $acc2,$t2,$acc2
1619         sllx    $acc4,63,$t3            ! don't forget carry bit
1620         stx     $acc1,[$rp+8]
1621         or      $acc3,$t3,$acc3
1622         stx     $acc2,[$rp+16]
1623         retl
1624         stx     $acc3,[$rp+24]
1625 .type   __ecp_nistz256_div_by_2_vis3,#function
1626 .size   __ecp_nistz256_div_by_2_vis3,.-__ecp_nistz256_div_by_2_vis3
1627
1628 ! compared to __ecp_nistz256_mul_mont it's almost 4x smaller and
1629 ! 4x faster [on T4]...
1630 .align  32
1631 __ecp_nistz256_mul_mont_vis3:
1632         mulx    $a0,$bi,$acc0
1633         not     $poly3,$poly3           ! 0xFFFFFFFF00000001
1634         umulxhi $a0,$bi,$t0
1635         mulx    $a1,$bi,$acc1
1636         umulxhi $a1,$bi,$t1
1637         mulx    $a2,$bi,$acc2
1638         umulxhi $a2,$bi,$t2
1639         mulx    $a3,$bi,$acc3
1640         umulxhi $a3,$bi,$t3
1641         ldx     [$bp+8],$bi             ! b[1]
1642
1643         addcc   $acc1,$t0,$acc1         ! accumulate high parts of multiplication
1644          sllx   $acc0,32,$t0
1645         addxccc $acc2,$t1,$acc2
1646          srlx   $acc0,32,$t1
1647         addxccc $acc3,$t2,$acc3
1648         addxc   %g0,$t3,$acc4
1649         mov     0,$acc5
1650 ___
1651 for($i=1;$i<4;$i++) {
1652         # Reduction iteration is normally performed by accumulating
1653         # result of multiplication of modulus by "magic" digit [and
1654         # omitting least significant word, which is guaranteed to
1655         # be 0], but thanks to special form of modulus and "magic"
1656         # digit being equal to least significant word, it can be
1657         # performed with additions and subtractions alone. Indeed:
1658         #
1659         #            ffff0001.00000000.0000ffff.ffffffff
1660         # *                                     abcdefgh
1661         # + xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.abcdefgh
1662         #
1663         # Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we
1664         # rewrite above as:
1665         #
1666         #   xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.abcdefgh
1667         # + abcdefgh.abcdefgh.0000abcd.efgh0000.00000000
1668         # - 0000abcd.efgh0000.00000000.00000000.abcdefgh
1669         #
1670         # or marking redundant operations:
1671         #
1672         #   xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.--------
1673         # + abcdefgh.abcdefgh.0000abcd.efgh0000.--------
1674         # - 0000abcd.efgh0000.--------.--------.--------
1675         #   ^^^^^^^^ but this word is calculated with umulxhi, because
1676         #            there is no subtract with 64-bit borrow:-(
1677
1678 $code.=<<___;
1679         sub     $acc0,$t0,$t2           ! acc0*0xFFFFFFFF00000001, low part
1680         umulxhi $acc0,$poly3,$t3        ! acc0*0xFFFFFFFF00000001, high part
1681         addcc   $acc1,$t0,$acc0         ! +=acc[0]<<96 and omit acc[0]
1682         mulx    $a0,$bi,$t0
1683         addxccc $acc2,$t1,$acc1
1684         mulx    $a1,$bi,$t1
1685         addxccc $acc3,$t2,$acc2         ! +=acc[0]*0xFFFFFFFF00000001
1686         mulx    $a2,$bi,$t2
1687         addxccc $acc4,$t3,$acc3
1688         mulx    $a3,$bi,$t3
1689         addxc   $acc5,%g0,$acc4
1690
1691         addcc   $acc0,$t0,$acc0         ! accumulate low parts of multiplication
1692         umulxhi $a0,$bi,$t0
1693         addxccc $acc1,$t1,$acc1
1694         umulxhi $a1,$bi,$t1
1695         addxccc $acc2,$t2,$acc2
1696         umulxhi $a2,$bi,$t2
1697         addxccc $acc3,$t3,$acc3
1698         umulxhi $a3,$bi,$t3
1699         addxc   $acc4,%g0,$acc4
1700 ___
1701 $code.=<<___    if ($i<3);
1702         ldx     [$bp+8*($i+1)],$bi      ! bp[$i+1]
1703 ___
1704 $code.=<<___;
1705         addcc   $acc1,$t0,$acc1         ! accumulate high parts of multiplication
1706          sllx   $acc0,32,$t0
1707         addxccc $acc2,$t1,$acc2
1708          srlx   $acc0,32,$t1
1709         addxccc $acc3,$t2,$acc3
1710         addxccc $acc4,$t3,$acc4
1711         addxc   %g0,%g0,$acc5
1712 ___
1713 }
1714 $code.=<<___;
1715         sub     $acc0,$t0,$t2           ! acc0*0xFFFFFFFF00000001, low part
1716         umulxhi $acc0,$poly3,$t3        ! acc0*0xFFFFFFFF00000001, high part
1717         addcc   $acc1,$t0,$acc0         ! +=acc[0]<<96 and omit acc[0]
1718         addxccc $acc2,$t1,$acc1
1719         addxccc $acc3,$t2,$acc2         ! +=acc[0]*0xFFFFFFFF00000001
1720         addxccc $acc4,$t3,$acc3
1721         b       .Lmul_final_vis3        ! see below
1722         addxc   $acc5,%g0,$acc4
1723 .type   __ecp_nistz256_mul_mont_vis3,#function
1724 .size   __ecp_nistz256_mul_mont_vis3,.-__ecp_nistz256_mul_mont_vis3
1725
1726 ! compared to above __ecp_nistz256_mul_mont_vis3 it's 21% less
1727 ! instructions, but only 14% faster [on T4]...
1728 .align  32
1729 __ecp_nistz256_sqr_mont_vis3:
1730         !  |  |  |  |  |  |a1*a0|  |
1731         !  |  |  |  |  |a2*a0|  |  |
1732         !  |  |a3*a2|a3*a0|  |  |  |
1733         !  |  |  |  |a2*a1|  |  |  |
1734         !  |  |  |a3*a1|  |  |  |  |
1735         ! *|  |  |  |  |  |  |  | 2|
1736         ! +|a3*a3|a2*a2|a1*a1|a0*a0|
1737         !  |--+--+--+--+--+--+--+--|
1738         !  |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is $accx, i.e. follow $accx
1739         !
1740         !  "can't overflow" below mark carrying into high part of
1741         !  multiplication result, which can't overflow, because it
1742         !  can never be all ones.
1743
1744         mulx    $a1,$a0,$acc1           ! a[1]*a[0]
1745         umulxhi $a1,$a0,$t1
1746         mulx    $a2,$a0,$acc2           ! a[2]*a[0]
1747         umulxhi $a2,$a0,$t2
1748         mulx    $a3,$a0,$acc3           ! a[3]*a[0]
1749         umulxhi $a3,$a0,$acc4
1750
1751         addcc   $acc2,$t1,$acc2         ! accumulate high parts of multiplication
1752         mulx    $a2,$a1,$t0             ! a[2]*a[1]
1753         umulxhi $a2,$a1,$t1
1754         addxccc $acc3,$t2,$acc3
1755         mulx    $a3,$a1,$t2             ! a[3]*a[1]
1756         umulxhi $a3,$a1,$t3
1757         addxc   $acc4,%g0,$acc4         ! can't overflow
1758
1759         mulx    $a3,$a2,$acc5           ! a[3]*a[2]
1760         not     $poly3,$poly3           ! 0xFFFFFFFF00000001
1761         umulxhi $a3,$a2,$acc6
1762
1763         addcc   $t2,$t1,$t1             ! accumulate high parts of multiplication
1764         mulx    $a0,$a0,$acc0           ! a[0]*a[0]
1765         addxc   $t3,%g0,$t2             ! can't overflow
1766
1767         addcc   $acc3,$t0,$acc3         ! accumulate low parts of multiplication
1768         umulxhi $a0,$a0,$a0
1769         addxccc $acc4,$t1,$acc4
1770         mulx    $a1,$a1,$t1             ! a[1]*a[1]
1771         addxccc $acc5,$t2,$acc5
1772         umulxhi $a1,$a1,$a1
1773         addxc   $acc6,%g0,$acc6         ! can't overflow
1774
1775         addcc   $acc1,$acc1,$acc1       ! acc[1-6]*=2
1776         mulx    $a2,$a2,$t2             ! a[2]*a[2]
1777         addxccc $acc2,$acc2,$acc2
1778         umulxhi $a2,$a2,$a2
1779         addxccc $acc3,$acc3,$acc3
1780         mulx    $a3,$a3,$t3             ! a[3]*a[3]
1781         addxccc $acc4,$acc4,$acc4
1782         umulxhi $a3,$a3,$a3
1783         addxccc $acc5,$acc5,$acc5
1784         addxccc $acc6,$acc6,$acc6
1785         addxc   %g0,%g0,$acc7
1786
1787         addcc   $acc1,$a0,$acc1         ! +a[i]*a[i]
1788         addxccc $acc2,$t1,$acc2
1789         addxccc $acc3,$a1,$acc3
1790         addxccc $acc4,$t2,$acc4
1791          sllx   $acc0,32,$t0
1792         addxccc $acc5,$a2,$acc5
1793          srlx   $acc0,32,$t1
1794         addxccc $acc6,$t3,$acc6
1795          sub    $acc0,$t0,$t2           ! acc0*0xFFFFFFFF00000001, low part
1796         addxc   $acc7,$a3,$acc7
1797 ___
1798 for($i=0;$i<3;$i++) {                   # reductions, see commentary
1799                                         # in multiplication for details
1800 $code.=<<___;
1801         umulxhi $acc0,$poly3,$t3        ! acc0*0xFFFFFFFF00000001, high part
1802         addcc   $acc1,$t0,$acc0         ! +=acc[0]<<96 and omit acc[0]
1803          sllx   $acc0,32,$t0
1804         addxccc $acc2,$t1,$acc1
1805          srlx   $acc0,32,$t1
1806         addxccc $acc3,$t2,$acc2         ! +=acc[0]*0xFFFFFFFF00000001
1807          sub    $acc0,$t0,$t2           ! acc0*0xFFFFFFFF00000001, low part
1808         addxc   %g0,$t3,$acc3           ! cant't overflow
1809 ___
1810 }
1811 $code.=<<___;
1812         umulxhi $acc0,$poly3,$t3        ! acc0*0xFFFFFFFF00000001, high part
1813         addcc   $acc1,$t0,$acc0         ! +=acc[0]<<96 and omit acc[0]
1814         addxccc $acc2,$t1,$acc1
1815         addxccc $acc3,$t2,$acc2         ! +=acc[0]*0xFFFFFFFF00000001
1816         addxc   %g0,$t3,$acc3           ! can't overflow
1817
1818         addcc   $acc0,$acc4,$acc0       ! accumulate upper half
1819         addxccc $acc1,$acc5,$acc1
1820         addxccc $acc2,$acc6,$acc2
1821         addxccc $acc3,$acc7,$acc3
1822         addxc   %g0,%g0,$acc4
1823
1824 .Lmul_final_vis3:
1825
1826         ! Final step is "if result > mod, subtract mod", but as comparison
1827         ! means subtraction, we do the subtraction and then copy outcome
1828         ! if it didn't borrow. But note that as we [have to] replace
1829         ! subtraction with addition with negative, carry/borrow logic is
1830         ! inverse.
1831
1832         addcc   $acc0,1,$t0             ! add -modulus, i.e. subtract
1833         not     $poly3,$poly3           ! restore 0x00000000FFFFFFFE
1834         addxccc $acc1,$poly1,$t1
1835         addxccc $acc2,$minus1,$t2
1836         addxccc $acc3,$poly3,$t3
1837         addxccc $acc4,$minus1,%g0       ! did it carry?
1838
1839         movcs   %xcc,$t0,$acc0
1840         movcs   %xcc,$t1,$acc1
1841         stx     $acc0,[$rp]
1842         movcs   %xcc,$t2,$acc2
1843         stx     $acc1,[$rp+8]
1844         movcs   %xcc,$t3,$acc3
1845         stx     $acc2,[$rp+16]
1846         retl
1847         stx     $acc3,[$rp+24]
1848 .type   __ecp_nistz256_sqr_mont_vis3,#function
1849 .size   __ecp_nistz256_sqr_mont_vis3,.-__ecp_nistz256_sqr_mont_vis3
1850 ___
1851
1852 ########################################################################
1853 # void ecp_nistz256_point_double(P256_POINT *out,const P256_POINT *inp);
1854 #
1855 {
1856 my ($res_x,$res_y,$res_z,
1857     $in_x,$in_y,$in_z,
1858     $S,$M,$Zsqr,$tmp0)=map(32*$_,(0..9));
1859 # above map() describes stack layout with 10 temporary
1860 # 256-bit vectors on top.
1861
1862 $code.=<<___;
1863 .align  32
1864 ecp_nistz256_point_double_vis3:
1865         save    %sp,-STACK64_FRAME-32*10,%sp
1866
1867         mov     $rp,$rp_real
1868 .Ldouble_shortcut_vis3:
1869         mov     -1,$minus1
1870         mov     -2,$poly3
1871         sllx    $minus1,32,$poly1               ! 0xFFFFFFFF00000000
1872         srl     $poly3,0,$poly3                 ! 0x00000000FFFFFFFE
1873
1874         ! convert input to uint64_t[4]
1875         ld      [$ap],$a0                       ! in_x
1876         ld      [$ap+4],$t0
1877         ld      [$ap+8],$a1
1878         ld      [$ap+12],$t1
1879         ld      [$ap+16],$a2
1880         ld      [$ap+20],$t2
1881         ld      [$ap+24],$a3
1882         ld      [$ap+28],$t3
1883         sllx    $t0,32,$t0
1884         sllx    $t1,32,$t1
1885         ld      [$ap+32],$acc0                  ! in_y
1886         or      $a0,$t0,$a0
1887         ld      [$ap+32+4],$t0
1888         sllx    $t2,32,$t2
1889         ld      [$ap+32+8],$acc1
1890         or      $a1,$t1,$a1
1891         ld      [$ap+32+12],$t1
1892         sllx    $t3,32,$t3
1893         ld      [$ap+32+16],$acc2
1894         or      $a2,$t2,$a2
1895         ld      [$ap+32+20],$t2
1896         or      $a3,$t3,$a3
1897         ld      [$ap+32+24],$acc3
1898         sllx    $t0,32,$t0
1899         ld      [$ap+32+28],$t3
1900         sllx    $t1,32,$t1
1901         stx     $a0,[%sp+LOCALS64+$in_x]
1902         sllx    $t2,32,$t2
1903         stx     $a1,[%sp+LOCALS64+$in_x+8]
1904         sllx    $t3,32,$t3
1905         stx     $a2,[%sp+LOCALS64+$in_x+16]
1906         or      $acc0,$t0,$acc0
1907         stx     $a3,[%sp+LOCALS64+$in_x+24]
1908         or      $acc1,$t1,$acc1
1909         stx     $acc0,[%sp+LOCALS64+$in_y]
1910         or      $acc2,$t2,$acc2
1911         stx     $acc1,[%sp+LOCALS64+$in_y+8]
1912         or      $acc3,$t3,$acc3
1913         stx     $acc2,[%sp+LOCALS64+$in_y+16]
1914         stx     $acc3,[%sp+LOCALS64+$in_y+24]
1915
1916         ld      [$ap+64],$a0                    ! in_z
1917         ld      [$ap+64+4],$t0
1918         ld      [$ap+64+8],$a1
1919         ld      [$ap+64+12],$t1
1920         ld      [$ap+64+16],$a2
1921         ld      [$ap+64+20],$t2
1922         ld      [$ap+64+24],$a3
1923         ld      [$ap+64+28],$t3
1924         sllx    $t0,32,$t0
1925         sllx    $t1,32,$t1
1926         or      $a0,$t0,$a0
1927         sllx    $t2,32,$t2
1928         or      $a1,$t1,$a1
1929         sllx    $t3,32,$t3
1930         or      $a2,$t2,$a2
1931         or      $a3,$t3,$a3
1932         sllx    $t0,32,$t0
1933         sllx    $t1,32,$t1
1934         stx     $a0,[%sp+LOCALS64+$in_z]
1935         sllx    $t2,32,$t2
1936         stx     $a1,[%sp+LOCALS64+$in_z+8]
1937         sllx    $t3,32,$t3
1938         stx     $a2,[%sp+LOCALS64+$in_z+16]
1939         stx     $a3,[%sp+LOCALS64+$in_z+24]
1940
1941         ! in_y is still in $acc0-$acc3
1942         call    __ecp_nistz256_mul_by_2_vis3    ! p256_mul_by_2(S, in_y);
1943         add     %sp,LOCALS64+$S,$rp
1944
1945         ! in_z is still in $a0-$a3
1946         call    __ecp_nistz256_sqr_mont_vis3    ! p256_sqr_mont(Zsqr, in_z);
1947         add     %sp,LOCALS64+$Zsqr,$rp
1948
1949         mov     $acc0,$a0                       ! put Zsqr aside
1950         mov     $acc1,$a1
1951         mov     $acc2,$a2
1952         mov     $acc3,$a3
1953
1954         add     %sp,LOCALS64+$in_x,$bp
1955         call    __ecp_nistz256_add_vis3         ! p256_add(M, Zsqr, in_x);
1956         add     %sp,LOCALS64+$M,$rp
1957
1958         mov     $a0,$acc0                       ! restore Zsqr
1959         ldx     [%sp+LOCALS64+$S],$a0           ! forward load
1960         mov     $a1,$acc1
1961         ldx     [%sp+LOCALS64+$S+8],$a1
1962         mov     $a2,$acc2
1963         ldx     [%sp+LOCALS64+$S+16],$a2
1964         mov     $a3,$acc3
1965         ldx     [%sp+LOCALS64+$S+24],$a3
1966
1967         add     %sp,LOCALS64+$in_x,$bp
1968         call    __ecp_nistz256_sub_morf_vis3    ! p256_sub(Zsqr, in_x, Zsqr);
1969         add     %sp,LOCALS64+$Zsqr,$rp
1970
1971         call    __ecp_nistz256_sqr_mont_vis3    ! p256_sqr_mont(S, S);
1972         add     %sp,LOCALS64+$S,$rp
1973
1974         ldx     [%sp+LOCALS64+$in_z],$bi
1975         ldx     [%sp+LOCALS64+$in_y],$a0
1976         ldx     [%sp+LOCALS64+$in_y+8],$a1
1977         ldx     [%sp+LOCALS64+$in_y+16],$a2
1978         ldx     [%sp+LOCALS64+$in_y+24],$a3
1979         add     %sp,LOCALS64+$in_z,$bp
1980         call    __ecp_nistz256_mul_mont_vis3    ! p256_mul_mont(tmp0, in_z, in_y);
1981         add     %sp,LOCALS64+$tmp0,$rp
1982
1983         ldx     [%sp+LOCALS64+$M],$bi           ! forward load
1984         ldx     [%sp+LOCALS64+$Zsqr],$a0
1985         ldx     [%sp+LOCALS64+$Zsqr+8],$a1
1986         ldx     [%sp+LOCALS64+$Zsqr+16],$a2
1987         ldx     [%sp+LOCALS64+$Zsqr+24],$a3
1988
1989         call    __ecp_nistz256_mul_by_2_vis3    ! p256_mul_by_2(res_z, tmp0);
1990         add     %sp,LOCALS64+$res_z,$rp
1991
1992         add     %sp,LOCALS64+$M,$bp
1993         call    __ecp_nistz256_mul_mont_vis3    ! p256_mul_mont(M, M, Zsqr);
1994         add     %sp,LOCALS64+$M,$rp
1995
1996         mov     $acc0,$a0                       ! put aside M
1997         mov     $acc1,$a1
1998         mov     $acc2,$a2
1999         mov     $acc3,$a3
2000         call    __ecp_nistz256_mul_by_2_vis3
2001         add     %sp,LOCALS64+$M,$rp
2002         mov     $a0,$t0                         ! copy M
2003         ldx     [%sp+LOCALS64+$S],$a0           ! forward load
2004         mov     $a1,$t1
2005         ldx     [%sp+LOCALS64+$S+8],$a1
2006         mov     $a2,$t2
2007         ldx     [%sp+LOCALS64+$S+16],$a2
2008         mov     $a3,$t3
2009         ldx     [%sp+LOCALS64+$S+24],$a3
2010         call    __ecp_nistz256_add_noload_vis3  ! p256_mul_by_3(M, M);
2011         add     %sp,LOCALS64+$M,$rp
2012
2013         call    __ecp_nistz256_sqr_mont_vis3    ! p256_sqr_mont(tmp0, S);
2014         add     %sp,LOCALS64+$tmp0,$rp
2015
2016         ldx     [%sp+LOCALS64+$S],$bi           ! forward load
2017         ldx     [%sp+LOCALS64+$in_x],$a0
2018         ldx     [%sp+LOCALS64+$in_x+8],$a1
2019         ldx     [%sp+LOCALS64+$in_x+16],$a2
2020         ldx     [%sp+LOCALS64+$in_x+24],$a3
2021
2022         call    __ecp_nistz256_div_by_2_vis3    ! p256_div_by_2(res_y, tmp0);
2023         add     %sp,LOCALS64+$res_y,$rp
2024
2025         add     %sp,LOCALS64+$S,$bp
2026         call    __ecp_nistz256_mul_mont_vis3    ! p256_mul_mont(S, S, in_x);
2027         add     %sp,LOCALS64+$S,$rp
2028
2029         ldx     [%sp+LOCALS64+$M],$a0           ! forward load
2030         ldx     [%sp+LOCALS64+$M+8],$a1
2031         ldx     [%sp+LOCALS64+$M+16],$a2
2032         ldx     [%sp+LOCALS64+$M+24],$a3
2033
2034         call    __ecp_nistz256_mul_by_2_vis3    ! p256_mul_by_2(tmp0, S);
2035         add     %sp,LOCALS64+$tmp0,$rp
2036
2037         call    __ecp_nistz256_sqr_mont_vis3    ! p256_sqr_mont(res_x, M);
2038         add     %sp,LOCALS64+$res_x,$rp
2039
2040         add     %sp,LOCALS64+$tmp0,$bp
2041         call    __ecp_nistz256_sub_from_vis3    ! p256_sub(res_x, res_x, tmp0);
2042         add     %sp,LOCALS64+$res_x,$rp
2043
2044         ldx     [%sp+LOCALS64+$M],$a0           ! forward load
2045         ldx     [%sp+LOCALS64+$M+8],$a1
2046         ldx     [%sp+LOCALS64+$M+16],$a2
2047         ldx     [%sp+LOCALS64+$M+24],$a3
2048
2049         add     %sp,LOCALS64+$S,$bp
2050         call    __ecp_nistz256_sub_morf_vis3    ! p256_sub(S, S, res_x);
2051         add     %sp,LOCALS64+$S,$rp
2052
2053         mov     $acc0,$bi
2054         call    __ecp_nistz256_mul_mont_vis3    ! p256_mul_mont(S, S, M);
2055         add     %sp,LOCALS64+$S,$rp
2056
2057         ldx     [%sp+LOCALS64+$res_x],$a0       ! forward load
2058         ldx     [%sp+LOCALS64+$res_x+8],$a1
2059         ldx     [%sp+LOCALS64+$res_x+16],$a2
2060         ldx     [%sp+LOCALS64+$res_x+24],$a3
2061
2062         add     %sp,LOCALS64+$res_y,$bp
2063         call    __ecp_nistz256_sub_from_vis3    ! p256_sub(res_y, S, res_y);
2064         add     %sp,LOCALS64+$res_y,$bp
2065
2066         ! convert output to uint_32[8]
2067         srlx    $a0,32,$t0
2068         srlx    $a1,32,$t1
2069         st      $a0,[$rp_real]                  ! res_x
2070         srlx    $a2,32,$t2
2071         st      $t0,[$rp_real+4]
2072         srlx    $a3,32,$t3
2073         st      $a1,[$rp_real+8]
2074         st      $t1,[$rp_real+12]
2075         st      $a2,[$rp_real+16]
2076         st      $t2,[$rp_real+20]
2077         st      $a3,[$rp_real+24]
2078         st      $t3,[$rp_real+28]
2079
2080         ldx     [%sp+LOCALS64+$res_z],$a0       ! forward load
2081         srlx    $acc0,32,$t0
2082         ldx     [%sp+LOCALS64+$res_z+8],$a1
2083         srlx    $acc1,32,$t1
2084         ldx     [%sp+LOCALS64+$res_z+16],$a2
2085         srlx    $acc2,32,$t2
2086         ldx     [%sp+LOCALS64+$res_z+24],$a3
2087         srlx    $acc3,32,$t3
2088         st      $acc0,[$rp_real+32]             ! res_y
2089         st      $t0,  [$rp_real+32+4]
2090         st      $acc1,[$rp_real+32+8]
2091         st      $t1,  [$rp_real+32+12]
2092         st      $acc2,[$rp_real+32+16]
2093         st      $t2,  [$rp_real+32+20]
2094         st      $acc3,[$rp_real+32+24]
2095         st      $t3,  [$rp_real+32+28]
2096
2097         srlx    $a0,32,$t0
2098         srlx    $a1,32,$t1
2099         st      $a0,[$rp_real+64]               ! res_z
2100         srlx    $a2,32,$t2
2101         st      $t0,[$rp_real+64+4]
2102         srlx    $a3,32,$t3
2103         st      $a1,[$rp_real+64+8]
2104         st      $t1,[$rp_real+64+12]
2105         st      $a2,[$rp_real+64+16]
2106         st      $t2,[$rp_real+64+20]
2107         st      $a3,[$rp_real+64+24]
2108         st      $t3,[$rp_real+64+28]
2109
2110         ret
2111         restore
2112 .type   ecp_nistz256_point_double_vis3,#function
2113 .size   ecp_nistz256_point_double_vis3,.-ecp_nistz256_point_double_vis3
2114 ___
2115 }
2116 ########################################################################
2117 # void ecp_nistz256_point_add(P256_POINT *out,const P256_POINT *in1,
2118 #                             const P256_POINT *in2);
2119 {
2120 my ($res_x,$res_y,$res_z,
2121     $in1_x,$in1_y,$in1_z,
2122     $in2_x,$in2_y,$in2_z,
2123     $H,$Hsqr,$R,$Rsqr,$Hcub,
2124     $U1,$U2,$S1,$S2)=map(32*$_,(0..17));
2125 my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr);
2126
2127 # above map() describes stack layout with 18 temporary
2128 # 256-bit vectors on top. Then we reserve some space for
2129 # !in1infty, !in2infty and result of check for zero.
2130
2131 $code.=<<___;
2132 .globl  ecp_nistz256_point_add_vis3
2133 .align  32
2134 ecp_nistz256_point_add_vis3:
2135         save    %sp,-STACK64_FRAME-32*18-32,%sp
2136
2137         mov     $rp,$rp_real
2138         mov     -1,$minus1
2139         mov     -2,$poly3
2140         sllx    $minus1,32,$poly1               ! 0xFFFFFFFF00000000
2141         srl     $poly3,0,$poly3                 ! 0x00000000FFFFFFFE
2142
2143         ! convert input to uint64_t[4]
2144         ld      [$bp],$a0                       ! in2_x
2145         ld      [$bp+4],$t0
2146         ld      [$bp+8],$a1
2147         ld      [$bp+12],$t1
2148         ld      [$bp+16],$a2
2149         ld      [$bp+20],$t2
2150         ld      [$bp+24],$a3
2151         ld      [$bp+28],$t3
2152         sllx    $t0,32,$t0
2153         sllx    $t1,32,$t1
2154         ld      [$bp+32],$acc0                  ! in2_y
2155         or      $a0,$t0,$a0
2156         ld      [$bp+32+4],$t0
2157         sllx    $t2,32,$t2
2158         ld      [$bp+32+8],$acc1
2159         or      $a1,$t1,$a1
2160         ld      [$bp+32+12],$t1
2161         sllx    $t3,32,$t3
2162         ld      [$bp+32+16],$acc2
2163         or      $a2,$t2,$a2
2164         ld      [$bp+32+20],$t2
2165         or      $a3,$t3,$a3
2166         ld      [$bp+32+24],$acc3
2167         sllx    $t0,32,$t0
2168         ld      [$bp+32+28],$t3
2169         sllx    $t1,32,$t1
2170         stx     $a0,[%sp+LOCALS64+$in2_x]
2171         sllx    $t2,32,$t2
2172         stx     $a1,[%sp+LOCALS64+$in2_x+8]
2173         sllx    $t3,32,$t3
2174         stx     $a2,[%sp+LOCALS64+$in2_x+16]
2175         or      $acc0,$t0,$acc0
2176         stx     $a3,[%sp+LOCALS64+$in2_x+24]
2177         or      $acc1,$t1,$acc1
2178         stx     $acc0,[%sp+LOCALS64+$in2_y]
2179         or      $acc2,$t2,$acc2
2180         stx     $acc1,[%sp+LOCALS64+$in2_y+8]
2181         or      $acc3,$t3,$acc3
2182         stx     $acc2,[%sp+LOCALS64+$in2_y+16]
2183         stx     $acc3,[%sp+LOCALS64+$in2_y+24]
2184
2185         ld      [$bp+64],$acc0                  ! in2_z
2186         ld      [$bp+64+4],$t0
2187         ld      [$bp+64+8],$acc1
2188         ld      [$bp+64+12],$t1
2189         ld      [$bp+64+16],$acc2
2190         ld      [$bp+64+20],$t2
2191         ld      [$bp+64+24],$acc3
2192         ld      [$bp+64+28],$t3
2193         sllx    $t0,32,$t0
2194         sllx    $t1,32,$t1
2195         ld      [$ap],$a0                       ! in1_x
2196         or      $acc0,$t0,$acc0
2197         ld      [$ap+4],$t0
2198         sllx    $t2,32,$t2
2199         ld      [$ap+8],$a1
2200         or      $acc1,$t1,$acc1
2201         ld      [$ap+12],$t1
2202         sllx    $t3,32,$t3
2203         ld      [$ap+16],$a2
2204         or      $acc2,$t2,$acc2
2205         ld      [$ap+20],$t2
2206         or      $acc3,$t3,$acc3
2207         ld      [$ap+24],$a3
2208         sllx    $t0,32,$t0
2209         ld      [$ap+28],$t3
2210         sllx    $t1,32,$t1
2211         stx     $acc0,[%sp+LOCALS64+$in2_z]
2212         sllx    $t2,32,$t2
2213         stx     $acc1,[%sp+LOCALS64+$in2_z+8]
2214         sllx    $t3,32,$t3
2215         stx     $acc2,[%sp+LOCALS64+$in2_z+16]
2216         stx     $acc3,[%sp+LOCALS64+$in2_z+24]
2217
2218         or      $acc1,$acc0,$acc0
2219         or      $acc3,$acc2,$acc2
2220         or      $acc2,$acc0,$acc0
2221         movrnz  $acc0,-1,$acc0                  ! !in2infty
2222         stx     $acc0,[%fp+STACK_BIAS-8]
2223
2224         or      $a0,$t0,$a0
2225         ld      [$ap+32],$acc0                  ! in1_y
2226         or      $a1,$t1,$a1
2227         ld      [$ap+32+4],$t0
2228         or      $a2,$t2,$a2
2229         ld      [$ap+32+8],$acc1
2230         or      $a3,$t3,$a3
2231         ld      [$ap+32+12],$t1
2232         ld      [$ap+32+16],$acc2
2233         ld      [$ap+32+20],$t2
2234         ld      [$ap+32+24],$acc3
2235         sllx    $t0,32,$t0
2236         ld      [$ap+32+28],$t3
2237         sllx    $t1,32,$t1
2238         stx     $a0,[%sp+LOCALS64+$in1_x]
2239         sllx    $t2,32,$t2
2240         stx     $a1,[%sp+LOCALS64+$in1_x+8]
2241         sllx    $t3,32,$t3
2242         stx     $a2,[%sp+LOCALS64+$in1_x+16]
2243         or      $acc0,$t0,$acc0
2244         stx     $a3,[%sp+LOCALS64+$in1_x+24]
2245         or      $acc1,$t1,$acc1
2246         stx     $acc0,[%sp+LOCALS64+$in1_y]
2247         or      $acc2,$t2,$acc2
2248         stx     $acc1,[%sp+LOCALS64+$in1_y+8]
2249         or      $acc3,$t3,$acc3
2250         stx     $acc2,[%sp+LOCALS64+$in1_y+16]
2251         stx     $acc3,[%sp+LOCALS64+$in1_y+24]
2252
2253         ldx     [%sp+LOCALS64+$in2_z],$a0       ! forward load
2254         ldx     [%sp+LOCALS64+$in2_z+8],$a1
2255         ldx     [%sp+LOCALS64+$in2_z+16],$a2
2256         ldx     [%sp+LOCALS64+$in2_z+24],$a3
2257
2258         ld      [$ap+64],$acc0                  ! in1_z
2259         ld      [$ap+64+4],$t0
2260         ld      [$ap+64+8],$acc1
2261         ld      [$ap+64+12],$t1
2262         ld      [$ap+64+16],$acc2
2263         ld      [$ap+64+20],$t2
2264         ld      [$ap+64+24],$acc3
2265         ld      [$ap+64+28],$t3
2266         sllx    $t0,32,$t0
2267         sllx    $t1,32,$t1
2268         or      $acc0,$t0,$acc0
2269         sllx    $t2,32,$t2
2270         or      $acc1,$t1,$acc1
2271         sllx    $t3,32,$t3
2272         stx     $acc0,[%sp+LOCALS64+$in1_z]
2273         or      $acc2,$t2,$acc2
2274         stx     $acc1,[%sp+LOCALS64+$in1_z+8]
2275         or      $acc3,$t3,$acc3
2276         stx     $acc2,[%sp+LOCALS64+$in1_z+16]
2277         stx     $acc3,[%sp+LOCALS64+$in1_z+24]
2278
2279         or      $acc1,$acc0,$acc0
2280         or      $acc3,$acc2,$acc2
2281         or      $acc2,$acc0,$acc0
2282         movrnz  $acc0,-1,$acc0                  ! !in1infty
2283         stx     $acc0,[%fp+STACK_BIAS-16]
2284
2285         call    __ecp_nistz256_sqr_mont_vis3    ! p256_sqr_mont(Z2sqr, in2_z);
2286         add     %sp,LOCALS64+$Z2sqr,$rp
2287
2288         ldx     [%sp+LOCALS64+$in1_z],$a0
2289         ldx     [%sp+LOCALS64+$in1_z+8],$a1
2290         ldx     [%sp+LOCALS64+$in1_z+16],$a2
2291         ldx     [%sp+LOCALS64+$in1_z+24],$a3
2292         call    __ecp_nistz256_sqr_mont_vis3    ! p256_sqr_mont(Z1sqr, in1_z);
2293         add     %sp,LOCALS64+$Z1sqr,$rp
2294
2295         ldx     [%sp+LOCALS64+$Z2sqr],$bi
2296         ldx     [%sp+LOCALS64+$in2_z],$a0
2297         ldx     [%sp+LOCALS64+$in2_z+8],$a1
2298         ldx     [%sp+LOCALS64+$in2_z+16],$a2
2299         ldx     [%sp+LOCALS64+$in2_z+24],$a3
2300         add     %sp,LOCALS64+$Z2sqr,$bp
2301         call    __ecp_nistz256_mul_mont_vis3    ! p256_mul_mont(S1, Z2sqr, in2_z);
2302         add     %sp,LOCALS64+$S1,$rp
2303
2304         ldx     [%sp+LOCALS64+$Z1sqr],$bi
2305         ldx     [%sp+LOCALS64+$in1_z],$a0
2306         ldx     [%sp+LOCALS64+$in1_z+8],$a1
2307         ldx     [%sp+LOCALS64+$in1_z+16],$a2
2308         ldx     [%sp+LOCALS64+$in1_z+24],$a3
2309         add     %sp,LOCALS64+$Z1sqr,$bp
2310         call    __ecp_nistz256_mul_mont_vis3    ! p256_mul_mont(S2, Z1sqr, in1_z);
2311         add     %sp,LOCALS64+$S2,$rp
2312
2313         ldx     [%sp+LOCALS64+$S1],$bi
2314         ldx     [%sp+LOCALS64+$in1_y],$a0
2315         ldx     [%sp+LOCALS64+$in1_y+8],$a1
2316         ldx     [%sp+LOCALS64+$in1_y+16],$a2
2317         ldx     [%sp+LOCALS64+$in1_y+24],$a3
2318         add     %sp,LOCALS64+$S1,$bp
2319         call    __ecp_nistz256_mul_mont_vis3    ! p256_mul_mont(S1, S1, in1_y);
2320         add     %sp,LOCALS64+$S1,$rp
2321
2322         ldx     [%sp+LOCALS64+$S2],$bi
2323         ldx     [%sp+LOCALS64+$in2_y],$a0
2324         ldx     [%sp+LOCALS64+$in2_y+8],$a1
2325         ldx     [%sp+LOCALS64+$in2_y+16],$a2
2326         ldx     [%sp+LOCALS64+$in2_y+24],$a3
2327         add     %sp,LOCALS64+$S2,$bp
2328         call    __ecp_nistz256_mul_mont_vis3    ! p256_mul_mont(S2, S2, in2_y);
2329         add     %sp,LOCALS64+$S2,$rp
2330
2331         ldx     [%sp+LOCALS64+$Z2sqr],$bi       ! forward load
2332         ldx     [%sp+LOCALS64+$in1_x],$a0
2333         ldx     [%sp+LOCALS64+$in1_x+8],$a1
2334         ldx     [%sp+LOCALS64+$in1_x+16],$a2
2335         ldx     [%sp+LOCALS64+$in1_x+24],$a3
2336
2337         add     %sp,LOCALS64+$S1,$bp
2338         call    __ecp_nistz256_sub_from_vis3    ! p256_sub(R, S2, S1);
2339         add     %sp,LOCALS64+$R,$rp
2340
2341         or      $acc1,$acc0,$acc0               ! see if result is zero
2342         or      $acc3,$acc2,$acc2
2343         or      $acc2,$acc0,$acc0
2344         stx     $acc0,[%fp+STACK_BIAS-24]
2345
2346         add     %sp,LOCALS64+$Z2sqr,$bp
2347         call    __ecp_nistz256_mul_mont_vis3    ! p256_mul_mont(U1, in1_x, Z2sqr);
2348         add     %sp,LOCALS64+$U1,$rp
2349
2350         ldx     [%sp+LOCALS64+$Z1sqr],$bi
2351         ldx     [%sp+LOCALS64+$in2_x],$a0
2352         ldx     [%sp+LOCALS64+$in2_x+8],$a1
2353         ldx     [%sp+LOCALS64+$in2_x+16],$a2
2354         ldx     [%sp+LOCALS64+$in2_x+24],$a3
2355         add     %sp,LOCALS64+$Z1sqr,$bp
2356         call    __ecp_nistz256_mul_mont_vis3    ! p256_mul_mont(U2, in2_x, Z1sqr);
2357         add     %sp,LOCALS64+$U2,$rp
2358
2359         ldx     [%sp+LOCALS64+$R],$a0           ! forward load
2360         ldx     [%sp+LOCALS64+$R+8],$a1
2361         ldx     [%sp+LOCALS64+$R+16],$a2
2362         ldx     [%sp+LOCALS64+$R+24],$a3
2363
2364         add     %sp,LOCALS64+$U1,$bp
2365         call    __ecp_nistz256_sub_from_vis3    ! p256_sub(H, U2, U1);
2366         add     %sp,LOCALS64+$H,$rp
2367
2368         or      $acc1,$acc0,$acc0               ! see if result is zero
2369         or      $acc3,$acc2,$acc2
2370         orcc    $acc2,$acc0,$acc0
2371
2372         bne,pt  %xcc,.Ladd_proceed_vis3         ! is_equal(U1,U2)?
2373         nop
2374
2375         ldx     [%fp+STACK_BIAS-8],$t0
2376         ldx     [%fp+STACK_BIAS-16],$t1
2377         ldx     [%fp+STACK_BIAS-24],$t2
2378         andcc   $t0,$t1,%g0
2379         be,pt   %xcc,.Ladd_proceed_vis3         ! (in1infty || in2infty)?
2380         nop
2381         andcc   $t2,$t2,%g0
2382         be,a,pt %xcc,.Ldouble_shortcut_vis3     ! is_equal(S1,S2)?
2383         add     %sp,32*(12-10)+32,%sp           ! difference in frame sizes
2384
2385         st      %g0,[$rp_real]
2386         st      %g0,[$rp_real+4]
2387         st      %g0,[$rp_real+8]
2388         st      %g0,[$rp_real+12]
2389         st      %g0,[$rp_real+16]
2390         st      %g0,[$rp_real+20]
2391         st      %g0,[$rp_real+24]
2392         st      %g0,[$rp_real+28]
2393         st      %g0,[$rp_real+32]
2394         st      %g0,[$rp_real+32+4]
2395         st      %g0,[$rp_real+32+8]
2396         st      %g0,[$rp_real+32+12]
2397         st      %g0,[$rp_real+32+16]
2398         st      %g0,[$rp_real+32+20]
2399         st      %g0,[$rp_real+32+24]
2400         st      %g0,[$rp_real+32+28]
2401         st      %g0,[$rp_real+64]
2402         st      %g0,[$rp_real+64+4]
2403         st      %g0,[$rp_real+64+8]
2404         st      %g0,[$rp_real+64+12]
2405         st      %g0,[$rp_real+64+16]
2406         st      %g0,[$rp_real+64+20]
2407         st      %g0,[$rp_real+64+24]
2408         st      %g0,[$rp_real+64+28]
2409         b       .Ladd_done_vis3
2410         nop
2411
2412 .align  16
2413 .Ladd_proceed_vis3:
2414         call    __ecp_nistz256_sqr_mont_vis3    ! p256_sqr_mont(Rsqr, R);
2415         add     %sp,LOCALS64+$Rsqr,$rp
2416
2417         ldx     [%sp+LOCALS64+$H],$bi
2418         ldx     [%sp+LOCALS64+$in1_z],$a0
2419         ldx     [%sp+LOCALS64+$in1_z+8],$a1
2420         ldx     [%sp+LOCALS64+$in1_z+16],$a2
2421         ldx     [%sp+LOCALS64+$in1_z+24],$a3
2422         add     %sp,LOCALS64+$H,$bp
2423         call    __ecp_nistz256_mul_mont_vis3    ! p256_mul_mont(res_z, H, in1_z);
2424         add     %sp,LOCALS64+$res_z,$rp
2425
2426         ldx     [%sp+LOCALS64+$H],$a0
2427         ldx     [%sp+LOCALS64+$H+8],$a1
2428         ldx     [%sp+LOCALS64+$H+16],$a2
2429         ldx     [%sp+LOCALS64+$H+24],$a3
2430         call    __ecp_nistz256_sqr_mont_vis3    ! p256_sqr_mont(Hsqr, H);
2431         add     %sp,LOCALS64+$Hsqr,$rp
2432
2433         ldx     [%sp+LOCALS64+$res_z],$bi
2434         ldx     [%sp+LOCALS64+$in2_z],$a0
2435         ldx     [%sp+LOCALS64+$in2_z+8],$a1
2436         ldx     [%sp+LOCALS64+$in2_z+16],$a2
2437         ldx     [%sp+LOCALS64+$in2_z+24],$a3
2438         add     %sp,LOCALS64+$res_z,$bp
2439         call    __ecp_nistz256_mul_mont_vis3    ! p256_mul_mont(res_z, res_z, in2_z);
2440         add     %sp,LOCALS64+$res_z,$rp
2441
2442         ldx     [%sp+LOCALS64+$H],$bi
2443         ldx     [%sp+LOCALS64+$Hsqr],$a0
2444         ldx     [%sp+LOCALS64+$Hsqr+8],$a1
2445         ldx     [%sp+LOCALS64+$Hsqr+16],$a2
2446         ldx     [%sp+LOCALS64+$Hsqr+24],$a3
2447         add     %sp,LOCALS64+$H,$bp
2448         call    __ecp_nistz256_mul_mont_vis3    ! p256_mul_mont(Hcub, Hsqr, H);
2449         add     %sp,LOCALS64+$Hcub,$rp
2450
2451         ldx     [%sp+LOCALS64+$U1],$bi
2452         ldx     [%sp+LOCALS64+$Hsqr],$a0
2453         ldx     [%sp+LOCALS64+$Hsqr+8],$a1
2454         ldx     [%sp+LOCALS64+$Hsqr+16],$a2
2455         ldx     [%sp+LOCALS64+$Hsqr+24],$a3
2456         add     %sp,LOCALS64+$U1,$bp
2457         call    __ecp_nistz256_mul_mont_vis3    ! p256_mul_mont(U2, U1, Hsqr);
2458         add     %sp,LOCALS64+$U2,$rp
2459
2460         call    __ecp_nistz256_mul_by_2_vis3    ! p256_mul_by_2(Hsqr, U2);
2461         add     %sp,LOCALS64+$Hsqr,$rp
2462
2463         add     %sp,LOCALS64+$Rsqr,$bp
2464         call    __ecp_nistz256_sub_morf_vis3    ! p256_sub(res_x, Rsqr, Hsqr);
2465         add     %sp,LOCALS64+$res_x,$rp
2466
2467         add     %sp,LOCALS64+$Hcub,$bp
2468         call    __ecp_nistz256_sub_from_vis3    !  p256_sub(res_x, res_x, Hcub);
2469         add     %sp,LOCALS64+$res_x,$rp
2470
2471         ldx     [%sp+LOCALS64+$S1],$bi          ! forward load
2472         ldx     [%sp+LOCALS64+$Hcub],$a0
2473         ldx     [%sp+LOCALS64+$Hcub+8],$a1
2474         ldx     [%sp+LOCALS64+$Hcub+16],$a2
2475         ldx     [%sp+LOCALS64+$Hcub+24],$a3
2476
2477         add     %sp,LOCALS64+$U2,$bp
2478         call    __ecp_nistz256_sub_morf_vis3    ! p256_sub(res_y, U2, res_x);
2479         add     %sp,LOCALS64+$res_y,$rp
2480
2481         add     %sp,LOCALS64+$S1,$bp
2482         call    __ecp_nistz256_mul_mont_vis3    ! p256_mul_mont(S2, S1, Hcub);
2483         add     %sp,LOCALS64+$S2,$rp
2484
2485         ldx     [%sp+LOCALS64+$R],$bi
2486         ldx     [%sp+LOCALS64+$res_y],$a0
2487         ldx     [%sp+LOCALS64+$res_y+8],$a1
2488         ldx     [%sp+LOCALS64+$res_y+16],$a2
2489         ldx     [%sp+LOCALS64+$res_y+24],$a3
2490         add     %sp,LOCALS64+$R,$bp
2491         call    __ecp_nistz256_mul_mont_vis3    ! p256_mul_mont(res_y, res_y, R);
2492         add     %sp,LOCALS64+$res_y,$rp
2493
2494         add     %sp,LOCALS64+$S2,$bp
2495         call    __ecp_nistz256_sub_from_vis3    ! p256_sub(res_y, res_y, S2);
2496         add     %sp,LOCALS64+$res_y,$rp
2497
2498         ldx     [%fp+STACK_BIAS-16],$t1         ! !in1infty
2499         ldx     [%fp+STACK_BIAS-8],$t2          ! !in2infty
2500 ___
2501 for($i=0;$i<96;$i+=16) {                        # conditional moves
2502 $code.=<<___;
2503         ldx     [%sp+LOCALS64+$res_x+$i],$acc0  ! res
2504         ldx     [%sp+LOCALS64+$res_x+$i+8],$acc1
2505         ldx     [%sp+LOCALS64+$in2_x+$i],$acc2  ! in2
2506         ldx     [%sp+LOCALS64+$in2_x+$i+8],$acc3
2507         ldx     [%sp+LOCALS64+$in1_x+$i],$acc4  ! in1
2508         ldx     [%sp+LOCALS64+$in1_x+$i+8],$acc5
2509         movrz   $t1,$acc2,$acc0
2510         movrz   $t1,$acc3,$acc1
2511         movrz   $t2,$acc4,$acc0
2512         movrz   $t2,$acc5,$acc1
2513         srlx    $acc0,32,$acc2
2514         srlx    $acc1,32,$acc3
2515         st      $acc0,[$rp_real+$i]
2516         st      $acc2,[$rp_real+$i+4]
2517         st      $acc1,[$rp_real+$i+8]
2518         st      $acc3,[$rp_real+$i+12]
2519 ___
2520 }
2521 $code.=<<___;
2522 .Ladd_done_vis3:
2523         ret
2524         restore
2525 .type   ecp_nistz256_point_add_vis3,#function
2526 .size   ecp_nistz256_point_add_vis3,.-ecp_nistz256_point_add_vis3
2527 ___
2528 }
2529 ########################################################################
2530 # void ecp_nistz256_point_add_affine(P256_POINT *out,const P256_POINT *in1,
2531 #                                    const P256_POINT_AFFINE *in2);
2532 {
2533 my ($res_x,$res_y,$res_z,
2534     $in1_x,$in1_y,$in1_z,
2535     $in2_x,$in2_y,
2536     $U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr)=map(32*$_,(0..14));
2537 my $Z1sqr = $S2;
2538 # above map() describes stack layout with 15 temporary
2539 # 256-bit vectors on top. Then we reserve some space for
2540 # !in1infty and !in2infty.
2541
2542 $code.=<<___;
2543 .align  32
2544 ecp_nistz256_point_add_affine_vis3:
2545         save    %sp,-STACK64_FRAME-32*15-32,%sp
2546
2547         mov     $rp,$rp_real
2548         mov     -1,$minus1
2549         mov     -2,$poly3
2550         sllx    $minus1,32,$poly1               ! 0xFFFFFFFF00000000
2551         srl     $poly3,0,$poly3                 ! 0x00000000FFFFFFFE
2552
2553         ! convert input to uint64_t[4]
2554         ld      [$bp],$a0                       ! in2_x
2555         ld      [$bp+4],$t0
2556         ld      [$bp+8],$a1
2557         ld      [$bp+12],$t1
2558         ld      [$bp+16],$a2
2559         ld      [$bp+20],$t2
2560         ld      [$bp+24],$a3
2561         ld      [$bp+28],$t3
2562         sllx    $t0,32,$t0
2563         sllx    $t1,32,$t1
2564         ld      [$bp+32],$acc0                  ! in2_y
2565         or      $a0,$t0,$a0
2566         ld      [$bp+32+4],$t0
2567         sllx    $t2,32,$t2
2568         ld      [$bp+32+8],$acc1
2569         or      $a1,$t1,$a1
2570         ld      [$bp+32+12],$t1
2571         sllx    $t3,32,$t3
2572         ld      [$bp+32+16],$acc2
2573         or      $a2,$t2,$a2
2574         ld      [$bp+32+20],$t2
2575         or      $a3,$t3,$a3
2576         ld      [$bp+32+24],$acc3
2577         sllx    $t0,32,$t0
2578         ld      [$bp+32+28],$t3
2579         sllx    $t1,32,$t1
2580         stx     $a0,[%sp+LOCALS64+$in2_x]
2581         sllx    $t2,32,$t2
2582         stx     $a1,[%sp+LOCALS64+$in2_x+8]
2583         sllx    $t3,32,$t3
2584         stx     $a2,[%sp+LOCALS64+$in2_x+16]
2585         or      $acc0,$t0,$acc0
2586         stx     $a3,[%sp+LOCALS64+$in2_x+24]
2587         or      $acc1,$t1,$acc1
2588         stx     $acc0,[%sp+LOCALS64+$in2_y]
2589         or      $acc2,$t2,$acc2
2590         stx     $acc1,[%sp+LOCALS64+$in2_y+8]
2591         or      $acc3,$t3,$acc3
2592         stx     $acc2,[%sp+LOCALS64+$in2_y+16]
2593         stx     $acc3,[%sp+LOCALS64+$in2_y+24]
2594
2595         or      $a1,$a0,$a0
2596         or      $a3,$a2,$a2
2597         or      $acc1,$acc0,$acc0
2598         or      $acc3,$acc2,$acc2
2599         or      $a2,$a0,$a0
2600         or      $acc2,$acc0,$acc0
2601         or      $acc0,$a0,$a0
2602         movrnz  $a0,-1,$a0                      ! !in2infty
2603         stx     $a0,[%fp+STACK_BIAS-8]
2604
2605         ld      [$ap],$a0                       ! in1_x
2606         ld      [$ap+4],$t0
2607         ld      [$ap+8],$a1
2608         ld      [$ap+12],$t1
2609         ld      [$ap+16],$a2
2610         ld      [$ap+20],$t2
2611         ld      [$ap+24],$a3
2612         ld      [$ap+28],$t3
2613         sllx    $t0,32,$t0
2614         sllx    $t1,32,$t1
2615         ld      [$ap+32],$acc0                  ! in1_y
2616         or      $a0,$t0,$a0
2617         ld      [$ap+32+4],$t0
2618         sllx    $t2,32,$t2
2619         ld      [$ap+32+8],$acc1
2620         or      $a1,$t1,$a1
2621         ld      [$ap+32+12],$t1
2622         sllx    $t3,32,$t3
2623         ld      [$ap+32+16],$acc2
2624         or      $a2,$t2,$a2
2625         ld      [$ap+32+20],$t2
2626         or      $a3,$t3,$a3
2627         ld      [$ap+32+24],$acc3
2628         sllx    $t0,32,$t0
2629         ld      [$ap+32+28],$t3
2630         sllx    $t1,32,$t1
2631         stx     $a0,[%sp+LOCALS64+$in1_x]
2632         sllx    $t2,32,$t2
2633         stx     $a1,[%sp+LOCALS64+$in1_x+8]
2634         sllx    $t3,32,$t3
2635         stx     $a2,[%sp+LOCALS64+$in1_x+16]
2636         or      $acc0,$t0,$acc0
2637         stx     $a3,[%sp+LOCALS64+$in1_x+24]
2638         or      $acc1,$t1,$acc1
2639         stx     $acc0,[%sp+LOCALS64+$in1_y]
2640         or      $acc2,$t2,$acc2
2641         stx     $acc1,[%sp+LOCALS64+$in1_y+8]
2642         or      $acc3,$t3,$acc3
2643         stx     $acc2,[%sp+LOCALS64+$in1_y+16]
2644         stx     $acc3,[%sp+LOCALS64+$in1_y+24]
2645
2646         ld      [$ap+64],$a0                    ! in1_z
2647         ld      [$ap+64+4],$t0
2648         ld      [$ap+64+8],$a1
2649         ld      [$ap+64+12],$t1
2650         ld      [$ap+64+16],$a2
2651         ld      [$ap+64+20],$t2
2652         ld      [$ap+64+24],$a3
2653         ld      [$ap+64+28],$t3
2654         sllx    $t0,32,$t0
2655         sllx    $t1,32,$t1
2656         or      $a0,$t0,$a0
2657         sllx    $t2,32,$t2
2658         or      $a1,$t1,$a1
2659         sllx    $t3,32,$t3
2660         stx     $a0,[%sp+LOCALS64+$in1_z]
2661         or      $a2,$t2,$a2
2662         stx     $a1,[%sp+LOCALS64+$in1_z+8]
2663         or      $a3,$t3,$a3
2664         stx     $a2,[%sp+LOCALS64+$in1_z+16]
2665         stx     $a3,[%sp+LOCALS64+$in1_z+24]
2666
2667         or      $a1,$a0,$t0
2668         or      $a3,$a2,$t2
2669         or      $t2,$t0,$t0
2670         movrnz  $t0,-1,$t0                      ! !in1infty
2671         stx     $t0,[%fp+STACK_BIAS-16]
2672
2673         call    __ecp_nistz256_sqr_mont_vis3    ! p256_sqr_mont(Z1sqr, in1_z);
2674         add     %sp,LOCALS64+$Z1sqr,$rp
2675
2676         ldx     [%sp+LOCALS64+$in2_x],$bi
2677         mov     $acc0,$a0
2678         mov     $acc1,$a1
2679         mov     $acc2,$a2
2680         mov     $acc3,$a3
2681         add     %sp,LOCALS64+$in2_x,$bp
2682         call    __ecp_nistz256_mul_mont_vis3    ! p256_mul_mont(U2, Z1sqr, in2_x);
2683         add     %sp,LOCALS64+$U2,$rp
2684
2685         ldx     [%sp+LOCALS64+$Z1sqr],$bi       ! forward load
2686         ldx     [%sp+LOCALS64+$in1_z],$a0
2687         ldx     [%sp+LOCALS64+$in1_z+8],$a1
2688         ldx     [%sp+LOCALS64+$in1_z+16],$a2
2689         ldx     [%sp+LOCALS64+$in1_z+24],$a3
2690
2691         add     %sp,LOCALS64+$in1_x,$bp
2692         call    __ecp_nistz256_sub_from_vis3    ! p256_sub(H, U2, in1_x);
2693         add     %sp,LOCALS64+$H,$rp
2694
2695         add     %sp,LOCALS64+$Z1sqr,$bp
2696         call    __ecp_nistz256_mul_mont_vis3    ! p256_mul_mont(S2, Z1sqr, in1_z);
2697         add     %sp,LOCALS64+$S2,$rp
2698
2699         ldx     [%sp+LOCALS64+$H],$bi
2700         ldx     [%sp+LOCALS64+$in1_z],$a0
2701         ldx     [%sp+LOCALS64+$in1_z+8],$a1
2702         ldx     [%sp+LOCALS64+$in1_z+16],$a2
2703         ldx     [%sp+LOCALS64+$in1_z+24],$a3
2704         add     %sp,LOCALS64+$H,$bp
2705         call    __ecp_nistz256_mul_mont_vis3    ! p256_mul_mont(res_z, H, in1_z);
2706         add     %sp,LOCALS64+$res_z,$rp
2707
2708         ldx     [%sp+LOCALS64+$S2],$bi
2709         ldx     [%sp+LOCALS64+$in2_y],$a0
2710         ldx     [%sp+LOCALS64+$in2_y+8],$a1
2711         ldx     [%sp+LOCALS64+$in2_y+16],$a2
2712         ldx     [%sp+LOCALS64+$in2_y+24],$a3
2713         add     %sp,LOCALS64+$S2,$bp
2714         call    __ecp_nistz256_mul_mont_vis3    ! p256_mul_mont(S2, S2, in2_y);
2715         add     %sp,LOCALS64+$S2,$rp
2716
2717         ldx     [%sp+LOCALS64+$H],$a0           ! forward load
2718         ldx     [%sp+LOCALS64+$H+8],$a1
2719         ldx     [%sp+LOCALS64+$H+16],$a2
2720         ldx     [%sp+LOCALS64+$H+24],$a3
2721
2722         add     %sp,LOCALS64+$in1_y,$bp
2723         call    __ecp_nistz256_sub_from_vis3    ! p256_sub(R, S2, in1_y);
2724         add     %sp,LOCALS64+$R,$rp
2725
2726         call    __ecp_nistz256_sqr_mont_vis3    ! p256_sqr_mont(Hsqr, H);
2727         add     %sp,LOCALS64+$Hsqr,$rp
2728
2729         ldx     [%sp+LOCALS64+$R],$a0
2730         ldx     [%sp+LOCALS64+$R+8],$a1
2731         ldx     [%sp+LOCALS64+$R+16],$a2
2732         ldx     [%sp+LOCALS64+$R+24],$a3
2733         call    __ecp_nistz256_sqr_mont_vis3    ! p256_sqr_mont(Rsqr, R);
2734         add     %sp,LOCALS64+$Rsqr,$rp
2735
2736         ldx     [%sp+LOCALS64+$H],$bi
2737         ldx     [%sp+LOCALS64+$Hsqr],$a0
2738         ldx     [%sp+LOCALS64+$Hsqr+8],$a1
2739         ldx     [%sp+LOCALS64+$Hsqr+16],$a2
2740         ldx     [%sp+LOCALS64+$Hsqr+24],$a3
2741         add     %sp,LOCALS64+$H,$bp
2742         call    __ecp_nistz256_mul_mont_vis3    ! p256_mul_mont(Hcub, Hsqr, H);
2743         add     %sp,LOCALS64+$Hcub,$rp
2744
2745         ldx     [%sp+LOCALS64+$Hsqr],$bi
2746         ldx     [%sp+LOCALS64+$in1_x],$a0
2747         ldx     [%sp+LOCALS64+$in1_x+8],$a1
2748         ldx     [%sp+LOCALS64+$in1_x+16],$a2
2749         ldx     [%sp+LOCALS64+$in1_x+24],$a3
2750         add     %sp,LOCALS64+$Hsqr,$bp
2751         call    __ecp_nistz256_mul_mont_vis3    ! p256_mul_mont(U2, in1_x, Hsqr);
2752         add     %sp,LOCALS64+$U2,$rp
2753
2754         call    __ecp_nistz256_mul_by_2_vis3    ! p256_mul_by_2(Hsqr, U2);
2755         add     %sp,LOCALS64+$Hsqr,$rp
2756
2757         add     %sp,LOCALS64+$Rsqr,$bp
2758         call    __ecp_nistz256_sub_morf_vis3    ! p256_sub(res_x, Rsqr, Hsqr);
2759         add     %sp,LOCALS64+$res_x,$rp
2760
2761         add     %sp,LOCALS64+$Hcub,$bp
2762         call    __ecp_nistz256_sub_from_vis3    !  p256_sub(res_x, res_x, Hcub);
2763         add     %sp,LOCALS64+$res_x,$rp
2764
2765         ldx     [%sp+LOCALS64+$Hcub],$bi        ! forward load
2766         ldx     [%sp+LOCALS64+$in1_y],$a0
2767         ldx     [%sp+LOCALS64+$in1_y+8],$a1
2768         ldx     [%sp+LOCALS64+$in1_y+16],$a2
2769         ldx     [%sp+LOCALS64+$in1_y+24],$a3
2770
2771         add     %sp,LOCALS64+$U2,$bp
2772         call    __ecp_nistz256_sub_morf_vis3    ! p256_sub(res_y, U2, res_x);
2773         add     %sp,LOCALS64+$res_y,$rp
2774
2775         add     %sp,LOCALS64+$Hcub,$bp
2776         call    __ecp_nistz256_mul_mont_vis3    ! p256_mul_mont(S2, in1_y, Hcub);
2777         add     %sp,LOCALS64+$S2,$rp
2778
2779         ldx     [%sp+LOCALS64+$R],$bi
2780         ldx     [%sp+LOCALS64+$res_y],$a0
2781         ldx     [%sp+LOCALS64+$res_y+8],$a1
2782         ldx     [%sp+LOCALS64+$res_y+16],$a2
2783         ldx     [%sp+LOCALS64+$res_y+24],$a3
2784         add     %sp,LOCALS64+$R,$bp
2785         call    __ecp_nistz256_mul_mont_vis3    ! p256_mul_mont(res_y, res_y, R);
2786         add     %sp,LOCALS64+$res_y,$rp
2787
2788         add     %sp,LOCALS64+$S2,$bp
2789         call    __ecp_nistz256_sub_from_vis3    ! p256_sub(res_y, res_y, S2);
2790         add     %sp,LOCALS64+$res_y,$rp
2791
2792         ldx     [%fp+STACK_BIAS-16],$t1         ! !in1infty
2793         ldx     [%fp+STACK_BIAS-8],$t2          ! !in2infty
2794 1:      call    .+8
2795         add     %o7,.Lone_mont_vis3-1b,$bp
2796 ___
2797 for($i=0;$i<64;$i+=16) {                        # conditional moves
2798 $code.=<<___;
2799         ldx     [%sp+LOCALS64+$res_x+$i],$acc0  ! res
2800         ldx     [%sp+LOCALS64+$res_x+$i+8],$acc1
2801         ldx     [%sp+LOCALS64+$in2_x+$i],$acc2  ! in2
2802         ldx     [%sp+LOCALS64+$in2_x+$i+8],$acc3
2803         ldx     [%sp+LOCALS64+$in1_x+$i],$acc4  ! in1
2804         ldx     [%sp+LOCALS64+$in1_x+$i+8],$acc5
2805         movrz   $t1,$acc2,$acc0
2806         movrz   $t1,$acc3,$acc1
2807         movrz   $t2,$acc4,$acc0
2808         movrz   $t2,$acc5,$acc1
2809         srlx    $acc0,32,$acc2
2810         srlx    $acc1,32,$acc3
2811         st      $acc0,[$rp_real+$i]
2812         st      $acc2,[$rp_real+$i+4]
2813         st      $acc1,[$rp_real+$i+8]
2814         st      $acc3,[$rp_real+$i+12]
2815 ___
2816 }
2817 for(;$i<96;$i+=16) {
2818 $code.=<<___;
2819         ldx     [%sp+LOCALS64+$res_x+$i],$acc0  ! res
2820         ldx     [%sp+LOCALS64+$res_x+$i+8],$acc1
2821         ldx     [$bp+$i-64],$acc2               ! "in2"
2822         ldx     [$bp+$i-64+8],$acc3
2823         ldx     [%sp+LOCALS64+$in1_x+$i],$acc4  ! in1
2824         ldx     [%sp+LOCALS64+$in1_x+$i+8],$acc5
2825         movrz   $t1,$acc2,$acc0
2826         movrz   $t1,$acc3,$acc1
2827         movrz   $t2,$acc4,$acc0
2828         movrz   $t2,$acc5,$acc1
2829         srlx    $acc0,32,$acc2
2830         srlx    $acc1,32,$acc3
2831         st      $acc0,[$rp_real+$i]
2832         st      $acc2,[$rp_real+$i+4]
2833         st      $acc1,[$rp_real+$i+8]
2834         st      $acc3,[$rp_real+$i+12]
2835 ___
2836 }
2837 $code.=<<___;
2838         ret
2839         restore
2840 .type   ecp_nistz256_point_add_affine_vis3,#function
2841 .size   ecp_nistz256_point_add_affine_vis3,.-ecp_nistz256_point_add_affine_vis3
2842 .align  64
2843 .Lone_mont_vis3:
2844 .long   0x00000000,0x00000001, 0xffffffff,0x00000000
2845 .long   0xffffffff,0xffffffff, 0x00000000,0xfffffffe
2846 .align  64
2847 #endif
2848 ___
2849 }                                                               }}}
2850 \f
2851 # Purpose of these subroutines is to explicitly encode VIS instructions,
2852 # so that one can compile the module without having to specify VIS
2853 # extensions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
2854 # Idea is to reserve for option to produce "universal" binary and let
2855 # programmer detect if current CPU is VIS capable at run-time.
2856 sub unvis3 {
2857 my ($mnemonic,$rs1,$rs2,$rd)=@_;
2858 my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
2859 my ($ref,$opf);
2860 my %visopf = (  "addxc"         => 0x011,
2861                 "addxccc"       => 0x013,
2862                 "umulxhi"       => 0x016        );
2863
2864     $ref = "$mnemonic\t$rs1,$rs2,$rd";
2865
2866     if ($opf=$visopf{$mnemonic}) {
2867         foreach ($rs1,$rs2,$rd) {
2868             return $ref if (!/%([goli])([0-9])/);
2869             $_=$bias{$1}+$2;
2870         }
2871
2872         return  sprintf ".word\t0x%08x !%s",
2873                         0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
2874                         $ref;
2875     } else {
2876         return $ref;
2877     }
2878 }
2879
2880 foreach (split("\n",$code)) {
2881         s/\`([^\`]*)\`/eval $1/ge;
2882
2883         s/\b(umulxhi|addxc[c]{0,2})\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
2884                 &unvis3($1,$2,$3,$4)
2885          /ge;
2886
2887         print $_,"\n";
2888 }
2889
2890 close STDOUT;