# Imported from libcrypto (LibreSSL 2.5.2)
# [unleashed.git] / lib / libcrypto / bn / asm / alpha-mont.pl
# blob 41700d5bd58b88fd4c50e5b3965ede49796e95f4
#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# On 21264 RSA sign performance improves by 70/35/20/15 percent for
# 512/1024/2048/4096 bit key lengths. This is against vendor compiler
# instructed to '-tune host' code with in-line assembler. Other
# benchmarks improve by 15-20%. To anchor it to something else, the
# code provides approximately the same performance per GHz as AMD64.
# I.e. if you compare 1GHz 21264 and 2GHz Opteron, you'll observe ~2x
# difference.

# This script emits Alpha assembly for bn_mul_mont() to STDOUT. The
# Perl scalars below are register names: they are interpolated into
# the $code heredoc, so e.g. every occurrence of $rp in the assembly
# text becomes "a0". First the OSF/1 argument registers:

# int bn_mul_mont(
$rp="a0";	# BN_ULONG *rp,
$ap="a1";	# const BN_ULONG *ap,
$bp="a2";	# const BN_ULONG *bp,
$np="a3";	# const BN_ULONG *np,
$n0="a4";	# const BN_ULONG *n0,
$num="a5";	# int num);

# Scratch registers: lo/hi pairs hold 128-bit products for the ap[] and
# np[] strands, $tp walks the temporary vector on the stack, $m1 is the
# per-outer-iteration Montgomery multiplier.
$lo0="t0";
$hi0="t1";
$lo1="t2";
$hi1="t3";
$aj="t4";
$bi="t5";
$nj="t6";
$tp="t7";
$alo="t8";
$ahi="t9";
$nlo="t10";
$nhi="t11";
$tj="t12";
$i="s3";
$j="s4";
$m1="s5";

# Code layout: prologue (num<4 is rejected, returning 0); .L1st runs the
# first outer iteration; .Louter/.Linner run the remaining ones with the
# inner loop hand-slotted for the 21264 dual-issue pipes (#U0/#U1/#L0/#L1
# annotations); .Lsub performs the final conditional subtraction of np[]
# and .Lcopy writes the result while zapping the stack temporary.
# NOTE: "\@" in the .ascii string keeps Perl from interpolating an array
# inside this interpolating heredoc.
$code=<<___;
#include <machine/asm.h>

.text

.set	noat
.set	noreorder

.globl	bn_mul_mont
.align	5
.ent	bn_mul_mont
bn_mul_mont:
	lda	sp,-48(sp)
	stq	ra,0(sp)
	stq	s3,8(sp)
	stq	s4,16(sp)
	stq	s5,24(sp)
	stq	fp,32(sp)
	mov	sp,fp
	.mask	0x0400f000,-48
	.frame	fp,48,ra
	.prologue 0

	.align	4
	.set	reorder
	sextl	$num,$num
	mov	0,v0
	cmplt	$num,4,AT
	bne	AT,.Lexit

	ldq	$hi0,0($ap)	# ap[0]
	s8addq	$num,16,AT
	ldq	$aj,8($ap)
	subq	sp,AT,sp
	ldq	$bi,0($bp)	# bp[0]
	lda	AT,-4096(zero)	# mov -4096,AT
	ldq	$n0,0($n0)
	and	sp,AT,sp

	mulq	$hi0,$bi,$lo0
	ldq	$hi1,0($np)	# np[0]
	umulh	$hi0,$bi,$hi0
	ldq	$nj,8($np)

	mulq	$lo0,$n0,$m1

	mulq	$hi1,$m1,$lo1
	umulh	$hi1,$m1,$hi1

	addq	$lo1,$lo0,$lo1
	cmpult	$lo1,$lo0,AT
	addq	$hi1,AT,$hi1

	mulq	$aj,$bi,$alo
	mov	2,$j
	umulh	$aj,$bi,$ahi
	mov	sp,$tp

	mulq	$nj,$m1,$nlo
	s8addq	$j,$ap,$aj
	umulh	$nj,$m1,$nhi
	s8addq	$j,$np,$nj
.align	4
.L1st:
	.set	noreorder
	ldq	$aj,0($aj)
	addl	$j,1,$j
	ldq	$nj,0($nj)
	lda	$tp,8($tp)

	addq	$alo,$hi0,$lo0
	mulq	$aj,$bi,$alo
	cmpult	$lo0,$hi0,AT
	addq	$nlo,$hi1,$lo1

	mulq	$nj,$m1,$nlo
	addq	$ahi,AT,$hi0
	cmpult	$lo1,$hi1,v0
	cmplt	$j,$num,$tj

	umulh	$aj,$bi,$ahi
	addq	$nhi,v0,$hi1
	addq	$lo1,$lo0,$lo1
	s8addq	$j,$ap,$aj

	umulh	$nj,$m1,$nhi
	cmpult	$lo1,$lo0,v0
	addq	$hi1,v0,$hi1
	s8addq	$j,$np,$nj

	stq	$lo1,-8($tp)

	unop
	bne	$tj,.L1st
	.set	reorder

	addq	$alo,$hi0,$lo0
	addq	$nlo,$hi1,$lo1
	cmpult	$lo0,$hi0,AT
	cmpult	$lo1,$hi1,v0
	addq	$ahi,AT,$hi0
	addq	$nhi,v0,$hi1

	addq	$lo1,$lo0,$lo1
	cmpult	$lo1,$lo0,v0
	addq	$hi1,v0,$hi1

	stq	$lo1,0($tp)

	addq	$hi1,$hi0,$hi1
	cmpult	$hi1,$hi0,AT
	stq	$hi1,8($tp)
	stq	AT,16($tp)

	mov	1,$i
.align	4
.Louter:
	s8addq	$i,$bp,$bi
	ldq	$hi0,0($ap)
	ldq	$aj,8($ap)
	ldq	$bi,0($bi)
	ldq	$hi1,0($np)
	ldq	$nj,8($np)
	ldq	$tj,0(sp)

	mulq	$hi0,$bi,$lo0
	umulh	$hi0,$bi,$hi0

	addq	$lo0,$tj,$lo0
	cmpult	$lo0,$tj,AT
	addq	$hi0,AT,$hi0

	mulq	$lo0,$n0,$m1

	mulq	$hi1,$m1,$lo1
	umulh	$hi1,$m1,$hi1

	addq	$lo1,$lo0,$lo1
	cmpult	$lo1,$lo0,AT
	mov	2,$j
	addq	$hi1,AT,$hi1

	mulq	$aj,$bi,$alo
	mov	sp,$tp
	umulh	$aj,$bi,$ahi

	mulq	$nj,$m1,$nlo
	s8addq	$j,$ap,$aj
	umulh	$nj,$m1,$nhi
.align	4
.Linner:
	.set	noreorder
	ldq	$tj,8($tp)	#L0
	nop			#U1
	ldq	$aj,0($aj)	#L1
	s8addq	$j,$np,$nj	#U0

	ldq	$nj,0($nj)	#L0
	nop			#U1
	addq	$alo,$hi0,$lo0	#L1
	lda	$tp,8($tp)

	mulq	$aj,$bi,$alo	#U1
	cmpult	$lo0,$hi0,AT	#L0
	addq	$nlo,$hi1,$lo1	#L1
	addl	$j,1,$j

	mulq	$nj,$m1,$nlo	#U1
	addq	$ahi,AT,$hi0	#L0
	addq	$lo0,$tj,$lo0	#L1
	cmpult	$lo1,$hi1,v0	#U0

	umulh	$aj,$bi,$ahi	#U1
	cmpult	$lo0,$tj,AT	#L0
	addq	$lo1,$lo0,$lo1	#L1
	addq	$nhi,v0,$hi1	#U0

	umulh	$nj,$m1,$nhi	#U1
	s8addq	$j,$ap,$aj	#L0
	cmpult	$lo1,$lo0,v0	#L1
	cmplt	$j,$num,$tj	#U0	# borrow $tj

	addq	$hi0,AT,$hi0	#L0
	addq	$hi1,v0,$hi1	#U1
	stq	$lo1,-8($tp)	#L1
	bne	$tj,.Linner	#U0
	.set	reorder

	ldq	$tj,8($tp)
	addq	$alo,$hi0,$lo0
	addq	$nlo,$hi1,$lo1
	cmpult	$lo0,$hi0,AT
	cmpult	$lo1,$hi1,v0
	addq	$ahi,AT,$hi0
	addq	$nhi,v0,$hi1

	addq	$lo0,$tj,$lo0
	cmpult	$lo0,$tj,AT
	addq	$hi0,AT,$hi0

	ldq	$tj,16($tp)
	addq	$lo1,$lo0,$j
	cmpult	$j,$lo0,v0
	addq	$hi1,v0,$hi1

	addq	$hi1,$hi0,$lo1
	stq	$j,0($tp)
	cmpult	$lo1,$hi0,$hi1
	addq	$lo1,$tj,$lo1
	cmpult	$lo1,$tj,AT
	addl	$i,1,$i
	addq	$hi1,AT,$hi1
	stq	$lo1,8($tp)
	cmplt	$i,$num,$tj	# borrow $tj
	stq	$hi1,16($tp)
	bne	$tj,.Louter

	s8addq	$num,sp,$tj	# &tp[num]
	mov	$rp,$bp		# put rp aside
	mov	sp,$tp
	mov	sp,$ap
	mov	0,$hi0		# clear borrow bit

.align	4
.Lsub:	ldq	$lo0,0($tp)
	ldq	$lo1,0($np)
	lda	$tp,8($tp)
	lda	$np,8($np)
	subq	$lo0,$lo1,$lo1	# tp[i]-np[i]
	cmpult	$lo0,$lo1,AT
	subq	$lo1,$hi0,$lo0
	cmpult	$lo1,$lo0,$hi0
	or	$hi0,AT,$hi0
	stq	$lo0,0($rp)
	cmpult	$tp,$tj,v0
	lda	$rp,8($rp)
	bne	v0,.Lsub

	subq	$hi1,$hi0,$hi0	# handle upmost overflow bit
	mov	sp,$tp
	mov	$bp,$rp		# restore rp

	and	sp,$hi0,$ap
	bic	$bp,$hi0,$bp
	bis	$bp,$ap,$ap	# ap=borrow?tp:rp

.align	4
.Lcopy:	ldq	$aj,0($ap)	# copy or in-place refresh
	lda	$tp,8($tp)
	lda	$rp,8($rp)
	lda	$ap,8($ap)
	stq	zero,-8($tp)	# zap tp
	cmpult	$tp,$tj,AT
	stq	$aj,-8($rp)
	bne	AT,.Lcopy
	mov	1,v0

.Lexit:
	.set	noreorder
	mov	fp,sp
	/*ldq	ra,0(sp)*/
	ldq	s3,8(sp)
	ldq	s4,16(sp)
	ldq	s5,24(sp)
	ldq	fp,32(sp)
	lda	sp,48(sp)
	ret	(ra)
.end	bn_mul_mont
.ascii	"Montgomery Multiplication for Alpha, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
___

# Emit the generated assembly on STDOUT (the build redirects it to a .S
# file); close STDOUT explicitly so a write error is not silently lost.
print $code;
close STDOUT;