3 # ====================================================================
4 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
10 # On 21264, RSA sign performance improves by 70/35/20/15 percent for
11 # 512/1024/2048/4096-bit key lengths. This is relative to code with
12 # in-line assembler built by the vendor compiler with '-tune host'. Other
13 # benchmarks improve by 15-20%. To anchor it to something else, the
14 # code provides approximately the same performance per GHz as AMD64.
15 # I.e. if you compare a 1GHz 21264 and a 2GHz Opteron, you'll observe a ~2x difference.
19 $rp="a0"; # BN_ULONG *rp,   (a0..a5: symbolic register names, presumably resolved via <asm/regdef.h> -- TODO confirm)
20 $ap="a1"; # const BN_ULONG *ap,
21 $bp="a2"; # const BN_ULONG *bp,
22 $np="a3"; # const BN_ULONG *np,
23 $n0="a4"; # const BN_ULONG *n0,
24 $num="a5"; # int num);
45 #include <asm/regdef.h>
78 ldq
$hi0,0($ap) # ap[0]
82 ldq
$bi,0($bp) # bp[0]
83 lda AT
,-4096(zero
) # mov -4096,AT
88 ldq
$hi1,0($np) # np[0]
203 s8addq
$j,$np,$nj #U0 # $nj = $np + 8*$j (address of the j-th quadword of np)
207 addq
$alo,$hi0,$lo0 #L1
210 mulq
$aj,$bi,$alo #U1 # low 64 bits of $aj*$bi
211 cmpult
$lo0,$hi0,AT
#L0 # AT = ($lo0 < $hi0), unsigned; carry of the addq into $lo0 above, provided $lo0/$hi0 are unchanged since (lines in between not shown)
212 addq
$nlo,$hi1,$lo1 #L1
215 mulq
$nj,$m1,$nlo #U1 # low 64 bits of $nj*$m1
216 addq
$ahi,AT
,$hi0 #L0 # $hi0 = $ahi + AT (AT is the 0/1 result of the cmpult above)
217 addq
$lo0,$tj,$lo0 #L1
218 cmpult
$lo1,$hi1,v0
#U0 # v0 = ($lo1 < $hi1), unsigned
220 umulh
$aj,$bi,$ahi #U1 # high 64 bits of the unsigned product $aj*$bi
221 cmpult
$lo0,$tj,AT
#L0 # AT = ($lo0 < $tj), unsigned; presumably carry of the $lo0 += $tj above -- confirm against the elided line
222 addq
$lo1,$lo0,$lo1 #L1
223 addq
$nhi,v0
,$hi1 #U0 # $hi1 = $nhi + v0 (v0 is the 0/1 result of the cmpult above)
225 umulh
$nj,$m1,$nhi #U1 # high 64 bits of the unsigned product $nj*$m1
226 s8addq
$j,$ap,$aj #L0 # $aj = $ap + 8*$j; the register holds an address here, not a loaded word
227 cmpult
$lo1,$lo0,v0
#L1 # v0 = ($lo1 < $lo0), unsigned
228 cmplt
$j,$num,$tj #U0 # borrow $tj as scratch: $tj = ($j < $num), signed
230 addq
$hi0,AT
,$hi0 #L0
231 addq
$hi1,v0
,$hi1 #U1
255 cmpult
$lo1,$hi0,$hi1 # $hi1 = ($lo1 < $hi0), unsigned
261 cmplt
$i,$num,$tj # borrow $tj as scratch: $tj = ($i < $num), signed
265 s8addq
$num,sp
,$tj # &tp[num], i.e. sp + 8*$num
266 mov
$rp,$bp # put rp aside
269 mov
0,$hi0 # clear borrow bit
272 .Lsub
: ldq
$lo0,0($tp) # $lo0 = quadword at $tp
276 subq
$lo0,$lo1,$lo1 # tp[i]-np[i]
279 cmpult
$lo1,$lo0,$hi0 # $hi0 = ($lo1 < $lo0), unsigned; the instructions that set these just before are not shown
286 subq
$hi1,$hi0,$hi0 # handle upmost overflow bit: $hi0 = $hi1 - $hi0
288 mov
$bp,$rp # restore rp
292 bis
$bp,$ap,$ap # ap=borrow?tp:rp (bitwise OR of $bp and $ap)
295 .Lcopy
: ldq
$aj,0($ap) # copy or in-place refresh
299 stq zero
,-8($tp) # zap tp: store 0 to the quadword at $tp-8
316 .ascii
"Montgomery Multiplication for Alpha, CRYPTOGAMS by <appro\@openssl.org>"