release/src/router/openssl/crypto/modes/asm/ghashv8-armx.pl

   1 #!/usr/bin/env perl
   2 #
   3 # ====================================================================
   4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
   5 # project. The module is, however, dual licensed under OpenSSL and
   6 # CRYPTOGAMS licenses depending on where you obtain it. For further
   7 # details see http://www.openssl.org/~appro/cryptogams/.
   8 # ====================================================================
   9 #
  10 # GHASH for ARMv8 Crypto Extension, 64-bit polynomial multiplication.
  11 #
  12 # June 2014
  13 #
  14 # Initial version was developed in tight cooperation with Ard
  15 # Biesheuvel <ard.biesheuvel@linaro.org> from bits-n-pieces from
  16 # other assembly modules. Just like aesv8-armx.pl this module
  17 # supports both AArch32 and AArch64 execution modes.
  18 #
  19 # July 2014
  20 #
  21 # Implement 2x aggregated reduction [see ghash-x86.pl for background
  22 # information].
  23 #
  24 # Current performance in cycles per processed byte:
  25 #
  26 #               PMULL[2]        32-bit NEON(*)
  27 # Apple A7      0.92            5.62
  28 # Cortex-A53    1.01            8.39
  29 # Cortex-A57    1.17            7.61
  30 #
  31 # (*)   presented for reference/comparison purposes;
  32
  33 $flavour = shift;
  34 open STDOUT,">".shift;
  35
  36 $Xi="x0";       # argument block
  37 $Htbl="x1";
  38 $inp="x2";
  39 $len="x3";
  40
  41 $inc="x12";
  42
  43 {
  44 my ($Xl,$Xm,$Xh,$IN)=map("q$_",(0..3));
  45 my ($t0,$t1,$t2,$xC2,$H,$Hhl,$H2)=map("q$_",(8..14));
  46
  47 $code=<<___;
  48 #include "arm_arch.h"
  49
  50 .text
  51 ___
  52 $code.=".arch   armv8-a+crypto\n"       if ($flavour =~ /64/);
  53 $code.=".fpu    neon\n.code     32\n"   if ($flavour !~ /64/);
  54
  55 ################################################################################
  56 # void gcm_init_v8(u128 Htable[16],const u64 H[2]);
  57 #
  58 # input:        128-bit H - secret parameter E(K,0^128)
  59 # output:       precomputed table filled with degrees of twisted H;
  60 #               H is twisted to handle reverse bitness of GHASH;
  61 #               only few of 16 slots of Htable[16] are used;
  62 #               data is opaque to outside world (which allows to
  63 #               optimize the code independently);
  64 #
  65 $code.=<<___;
  66 .global gcm_init_v8
  67 .type   gcm_init_v8,%function
  68 .align  4
  69 gcm_init_v8:
  70         vld1.64         {$t1},[x1]              @ load input H
  71         vmov.i8         $xC2,#0xe1
  72         vshl.i64        $xC2,$xC2,#57           @ 0xc2.0
  73         vext.8          $IN,$t1,$t1,#8
  74         vshr.u64        $t2,$xC2,#63
  75         vdup.32         $t1,${t1}[1]
  76         vext.8          $t0,$t2,$xC2,#8         @ t0=0xc2....01
  77         vshr.u64        $t2,$IN,#63
  78         vshr.s32        $t1,$t1,#31             @ broadcast carry bit
  79         vand            $t2,$t2,$t0
  80         vshl.i64        $IN,$IN,#1
  81         vext.8          $t2,$t2,$t2,#8
  82         vand            $t0,$t0,$t1
  83         vorr            $IN,$IN,$t2             @ H<<<=1
  84         veor            $H,$IN,$t0              @ twisted H
  85         vst1.64         {$H},[x0],#16           @ store Htable[0]
  86
  87         @ calculate H^2
  88         vext.8          $t0,$H,$H,#8            @ Karatsuba pre-processing
  89         vpmull.p64      $Xl,$H,$H
  90         veor            $t0,$t0,$H
  91         vpmull2.p64     $Xh,$H,$H
  92         vpmull.p64      $Xm,$t0,$t0
  93
  94         vext.8          $t1,$Xl,$Xh,#8          @ Karatsuba post-processing
  95         veor            $t2,$Xl,$Xh
  96         veor            $Xm,$Xm,$t1
  97         veor            $Xm,$Xm,$t2
  98         vpmull.p64      $t2,$Xl,$xC2            @ 1st phase
  99
 100         vmov            $Xh#lo,$Xm#hi           @ Xh|Xm - 256-bit result
 101         vmov            $Xm#hi,$Xl#lo           @ Xm is rotated Xl
 102         veor            $Xl,$Xm,$t2
 103
 104         vext.8          $t2,$Xl,$Xl,#8          @ 2nd phase
 105         vpmull.p64      $Xl,$Xl,$xC2
 106         veor            $t2,$t2,$Xh
 107         veor            $H2,$Xl,$t2
 108
 109         vext.8          $t1,$H2,$H2,#8          @ Karatsuba pre-processing
 110         veor            $t1,$t1,$H2
 111         vext.8          $Hhl,$t0,$t1,#8         @ pack Karatsuba pre-processed
 112         vst1.64         {$Hhl-$H2},[x0]         @ store Htable[1..2]
 113
 114         ret
 115 .size   gcm_init_v8,.-gcm_init_v8
 116 ___
 117 ################################################################################
 118 # void gcm_gmult_v8(u64 Xi[2],const u128 Htable[16]);
 119 #
 120 # input:        Xi - current hash value;
 121 #               Htable - table precomputed in gcm_init_v8;
 122 # output:       Xi - next hash value Xi;
 123 #
 124 $code.=<<___;
 125 .global gcm_gmult_v8
 126 .type   gcm_gmult_v8,%function
 127 .align  4
 128 gcm_gmult_v8:
 129         vld1.64         {$t1},[$Xi]             @ load Xi
 130         vmov.i8         $xC2,#0xe1
 131         vld1.64         {$H-$Hhl},[$Htbl]       @ load twisted H, ...
 132         vshl.u64        $xC2,$xC2,#57
 133 #ifndef __ARMEB__
 134         vrev64.8        $t1,$t1
 135 #endif
 136         vext.8          $IN,$t1,$t1,#8
 137
 138         vpmull.p64      $Xl,$H,$IN              @ H.lo·Xi.lo
 139         veor            $t1,$t1,$IN             @ Karatsuba pre-processing
 140         vpmull2.p64     $Xh,$H,$IN              @ H.hi·Xi.hi
 141         vpmull.p64      $Xm,$Hhl,$t1            @ (H.lo+H.hi)·(Xi.lo+Xi.hi)
 142
 143         vext.8          $t1,$Xl,$Xh,#8          @ Karatsuba post-processing
 144         veor            $t2,$Xl,$Xh
 145         veor            $Xm,$Xm,$t1
 146         veor            $Xm,$Xm,$t2
 147         vpmull.p64      $t2,$Xl,$xC2            @ 1st phase of reduction
 148
 149         vmov            $Xh#lo,$Xm#hi           @ Xh|Xm - 256-bit result
 150         vmov            $Xm#hi,$Xl#lo           @ Xm is rotated Xl
 151         veor            $Xl,$Xm,$t2
 152
 153         vext.8          $t2,$Xl,$Xl,#8          @ 2nd phase of reduction
 154         vpmull.p64      $Xl,$Xl,$xC2
 155         veor            $t2,$t2,$Xh
 156         veor            $Xl,$Xl,$t2
 157
 158 #ifndef __ARMEB__
 159         vrev64.8        $Xl,$Xl
 160 #endif
 161         vext.8          $Xl,$Xl,$Xl,#8
 162         vst1.64         {$Xl},[$Xi]             @ write out Xi
 163
 164         ret
 165 .size   gcm_gmult_v8,.-gcm_gmult_v8
 166 ___
 167 ################################################################################
 168 # void gcm_ghash_v8(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
 169 #
 170 # input:        table precomputed in gcm_init_v8;
 171 #               current hash value Xi;
 172 #               pointer to input data;
 173 #               length of input data in bytes, but divisible by block size;
 174 # output:       next hash value Xi;
 175 #
 176 $code.=<<___;
 177 .global gcm_ghash_v8
 178 .type   gcm_ghash_v8,%function
 179 .align  4
 180 gcm_ghash_v8:
 181 ___
 182 $code.=<<___            if ($flavour !~ /64/);
 183         vstmdb          sp!,{d8-d15}            @ 32-bit ABI says so
 184 ___
 185 $code.=<<___;
 186         vld1.64         {$Xl},[$Xi]             @ load [rotated] Xi
 187                                                 @ "[rotated]" means that
 188                                                 @ loaded value would have
 189                                                 @ to be rotated in order to
 190                                                 @ make it appear as in
 191                                                 @ alorithm specification
 192         subs            $len,$len,#32           @ see if $len is 32 or larger
 193         mov             $inc,#16                @ $inc is used as post-
 194                                                 @ increment for input pointer;
 195                                                 @ as loop is modulo-scheduled
 196                                                 @ $inc is zeroed just in time
 197                                                 @ to preclude oversteping
 198                                                 @ inp[len], which means that
 199                                                 @ last block[s] are actually
 200                                                 @ loaded twice, but last
 201                                                 @ copy is not processed
 202         vld1.64         {$H-$Hhl},[$Htbl],#32   @ load twisted H, ..., H^2
 203         vmov.i8         $xC2,#0xe1
 204         vld1.64         {$H2},[$Htbl]
 205         cclr            $inc,eq                 @ is it time to zero $inc?
 206         vext.8          $Xl,$Xl,$Xl,#8          @ rotate Xi
 207         vld1.64         {$t0},[$inp],#16        @ load [rotated] I[0]
 208         vshl.u64        $xC2,$xC2,#57           @ compose 0xc2.0 constant
 209 #ifndef __ARMEB__
 210         vrev64.8        $t0,$t0
 211         vrev64.8        $Xl,$Xl
 212 #endif
 213         vext.8          $IN,$t0,$t0,#8          @ rotate I[0]
 214         b.lo            .Lodd_tail_v8           @ $len was less than 32
 215 ___
 216 { my ($Xln,$Xmn,$Xhn,$In) = map("q$_",(4..7));
 217         #######
 218         # Xi+2 =[H*(Ii+1 + Xi+1)] mod P =
 219         #       [(H*Ii+1) + (H*Xi+1)] mod P =
 220         #       [(H*Ii+1) + H^2*(Ii+Xi)] mod P
 221         #
 222 $code.=<<___;
 223         vld1.64         {$t1},[$inp],$inc       @ load [rotated] I[1]
 224 #ifndef __ARMEB__
 225         vrev64.8        $t1,$t1
 226 #endif
 227         vext.8          $In,$t1,$t1,#8
 228         veor            $IN,$IN,$Xl             @ I[i]^=Xi
 229         vpmull.p64      $Xln,$H,$In             @ H·Ii+1
 230         veor            $t1,$t1,$In             @ Karatsuba pre-processing
 231         vpmull2.p64     $Xhn,$H,$In
 232         b               .Loop_mod2x_v8
 233
 234 .align  4
 235 .Loop_mod2x_v8:
 236         vext.8          $t2,$IN,$IN,#8
 237         subs            $len,$len,#32           @ is there more data?
 238         vpmull.p64      $Xl,$H2,$IN             @ H^2.lo·Xi.lo
 239         cclr            $inc,lo                 @ is it time to zero $inc?
 240
 241          vpmull.p64     $Xmn,$Hhl,$t1
 242         veor            $t2,$t2,$IN             @ Karatsuba pre-processing
 243         vpmull2.p64     $Xh,$H2,$IN             @ H^2.hi·Xi.hi
 244         veor            $Xl,$Xl,$Xln            @ accumulate
 245         vpmull2.p64     $Xm,$Hhl,$t2            @ (H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
 246          vld1.64        {$t0},[$inp],$inc       @ load [rotated] I[i+2]
 247
 248         veor            $Xh,$Xh,$Xhn
 249          cclr           $inc,eq                 @ is it time to zero $inc?
 250         veor            $Xm,$Xm,$Xmn
 251
 252         vext.8          $t1,$Xl,$Xh,#8          @ Karatsuba post-processing
 253         veor            $t2,$Xl,$Xh
 254         veor            $Xm,$Xm,$t1
 255          vld1.64        {$t1},[$inp],$inc       @ load [rotated] I[i+3]
 256 #ifndef __ARMEB__
 257          vrev64.8       $t0,$t0
 258 #endif
 259         veor            $Xm,$Xm,$t2
 260         vpmull.p64      $t2,$Xl,$xC2            @ 1st phase of reduction
 261
 262 #ifndef __ARMEB__
 263          vrev64.8       $t1,$t1
 264 #endif
 265         vmov            $Xh#lo,$Xm#hi           @ Xh|Xm - 256-bit result
 266         vmov            $Xm#hi,$Xl#lo           @ Xm is rotated Xl
 267          vext.8         $In,$t1,$t1,#8
 268          vext.8         $IN,$t0,$t0,#8
 269         veor            $Xl,$Xm,$t2
 270          vpmull.p64     $Xln,$H,$In             @ H·Ii+1
 271         veor            $IN,$IN,$Xh             @ accumulate $IN early
 272
 273         vext.8          $t2,$Xl,$Xl,#8          @ 2nd phase of reduction
 274         vpmull.p64      $Xl,$Xl,$xC2
 275         veor            $IN,$IN,$t2
 276          veor           $t1,$t1,$In             @ Karatsuba pre-processing
 277         veor            $IN,$IN,$Xl
 278          vpmull2.p64    $Xhn,$H,$In
 279         b.hs            .Loop_mod2x_v8          @ there was at least 32 more bytes
 280
 281         veor            $Xh,$Xh,$t2
 282         vext.8          $IN,$t0,$t0,#8          @ re-construct $IN
 283         adds            $len,$len,#32           @ re-construct $len
 284         veor            $Xl,$Xl,$Xh             @ re-construct $Xl
 285         b.eq            .Ldone_v8               @ is $len zero?
 286 ___
 287 }
 288 $code.=<<___;
 289 .Lodd_tail_v8:
 290         vext.8          $t2,$Xl,$Xl,#8
 291         veor            $IN,$IN,$Xl             @ inp^=Xi
 292         veor            $t1,$t0,$t2             @ $t1 is rotated inp^Xi
 293
 294         vpmull.p64      $Xl,$H,$IN              @ H.lo·Xi.lo
 295         veor            $t1,$t1,$IN             @ Karatsuba pre-processing
 296         vpmull2.p64     $Xh,$H,$IN              @ H.hi·Xi.hi
 297         vpmull.p64      $Xm,$Hhl,$t1            @ (H.lo+H.hi)·(Xi.lo+Xi.hi)
 298
 299         vext.8          $t1,$Xl,$Xh,#8          @ Karatsuba post-processing
 300         veor            $t2,$Xl,$Xh
 301         veor            $Xm,$Xm,$t1
 302         veor            $Xm,$Xm,$t2
 303         vpmull.p64      $t2,$Xl,$xC2            @ 1st phase of reduction
 304
 305         vmov            $Xh#lo,$Xm#hi           @ Xh|Xm - 256-bit result
 306         vmov            $Xm#hi,$Xl#lo           @ Xm is rotated Xl
 307         veor            $Xl,$Xm,$t2
 308
 309         vext.8          $t2,$Xl,$Xl,#8          @ 2nd phase of reduction
 310         vpmull.p64      $Xl,$Xl,$xC2
 311         veor            $t2,$t2,$Xh
 312         veor            $Xl,$Xl,$t2
 313
 314 .Ldone_v8:
 315 #ifndef __ARMEB__
 316         vrev64.8        $Xl,$Xl
 317 #endif
 318         vext.8          $Xl,$Xl,$Xl,#8
 319         vst1.64         {$Xl},[$Xi]             @ write out Xi
 320
 321 ___
 322 $code.=<<___            if ($flavour !~ /64/);
 323         vldmia          sp!,{d8-d15}            @ 32-bit ABI says so
 324 ___
 325 $code.=<<___;
 326         ret
 327 .size   gcm_ghash_v8,.-gcm_ghash_v8
 328 ___
 329 }
 330 $code.=<<___;
 331 .asciz  "GHASH for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
 332 .align  2
 333 ___
 334
 335 if ($flavour =~ /64/) {                 ######## 64-bit code
 336     sub unvmov {
 337         my $arg=shift;
 338
 339         $arg =~ m/q([0-9]+)#(lo|hi),\s*q([0-9]+)#(lo|hi)/o &&
 340         sprintf "ins    v%d.d[%d],v%d.d[%d]",$1,($2 eq "lo")?0:1,$3,($4 eq "lo")?0:1;
 341     }
 342     foreach(split("\n",$code)) {
 343         s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel $1$2,$1zr,$1$2,$3/o     or
 344         s/vmov\.i8/movi/o               or      # fix up legacy mnemonics
 345         s/vmov\s+(.*)/unvmov($1)/geo    or
 346         s/vext\.8/ext/o                 or
 347         s/vshr\.s/sshr\.s/o             or
 348         s/vshr/ushr/o                   or
 349         s/^(\s+)v/$1/o                  or      # strip off v prefix
 350         s/\bbx\s+lr\b/ret/o;
 351
 352         s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo;  # old->new registers
 353         s/@\s/\/\//o;                           # old->new style commentary
 354
 355         # fix up remainig legacy suffixes
 356         s/\.[ui]?8(\s)/$1/o;
 357         s/\.[uis]?32//o and s/\.16b/\.4s/go;
 358         m/\.p64/o and s/\.16b/\.1q/o;           # 1st pmull argument
 359         m/l\.p64/o and s/\.16b/\.1d/go;         # 2nd and 3rd pmull arguments
 360         s/\.[uisp]?64//o and s/\.16b/\.2d/go;
 361         s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;
 362
 363         print $_,"\n";
 364     }
 365 } else {                                ######## 32-bit code
 366     sub unvdup32 {
 367         my $arg=shift;
 368
 369         $arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
 370         sprintf "vdup.32        q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
 371     }
 372     sub unvpmullp64 {
 373         my ($mnemonic,$arg)=@_;
 374
 375         if ($arg =~ m/q([0-9]+),\s*q([0-9]+),\s*q([0-9]+)/o) {
 376             my $word = 0xf2a00e00|(($1&7)<<13)|(($1&8)<<19)
 377                                  |(($2&7)<<17)|(($2&8)<<4)
 378                                  |(($3&7)<<1) |(($3&8)<<2);
 379             $word |= 0x00010001  if ($mnemonic =~ "2");
 380             # since ARMv7 instructions are always encoded little-endian.
 381             # correct solution is to use .inst directive, but older
 382             # assemblers don't implement it:-(
 383             sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
 384                         $word&0xff,($word>>8)&0xff,
 385                         ($word>>16)&0xff,($word>>24)&0xff,
 386                         $mnemonic,$arg;
 387         }
 388     }
 389
 390     foreach(split("\n",$code)) {
 391         s/\b[wx]([0-9]+)\b/r$1/go;              # new->old registers
 392         s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go;   # new->old registers
 393         s/\/\/\s?/@ /o;                         # new->old style commentary
 394
 395         # fix up remainig new-style suffixes
 396         s/\],#[0-9]+/]!/o;
 397
 398         s/cclr\s+([^,]+),\s*([a-z]+)/mov$2      $1,#0/o                 or
 399         s/vdup\.32\s+(.*)/unvdup32($1)/geo                              or
 400         s/v?(pmull2?)\.p64\s+(.*)/unvpmullp64($1,$2)/geo                or
 401         s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo       or
 402         s/^(\s+)b\./$1b/o                                               or
 403         s/^(\s+)ret/$1bx\tlr/o;
 404
 405         print $_,"\n";
 406     }
 407 }
 408
 409 close STDOUT; # enforce flush