# OpenSSL 1.0.2g
# [tomato.git] / release/src/router/openssl/crypto/bn/asm/alpha-mont.pl
# blob 03596e2014d4035d02440316cc1cf025f726d772
1 #!/usr/bin/env perl
3 # ====================================================================
4 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
10 # On 21264 RSA sign performance improves by 70/35/20/15 percent for
11 # 512/1024/2048/4096 bit key lengths. This is against vendor compiler
12 # instructed to '-tune host' code with in-line assembler. Other
13 # benchmarks improve by 15-20%. To anchor it to something else, the
14 # code provides approximately the same performance per GHz as AMD64.
15 # I.e. if you compare 1GHz 21264 and 2GHz Opteron, you'll observe ~2x
16 # difference.
# Symbolic register names used when interpolating the assembly template.
#
# Argument registers, in Alpha calling-convention order, matching
#   int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
#                   const BN_ULONG *np, const BN_ULONG *n0, int num);
($rp, $ap, $bp, $np, $n0, $num) = map { "a$_" } 0..5;

# Caller-saved temporaries t0..t12:
#   lo0/hi0 - running low/high words of the ap[]*bp[i] row
#   lo1/hi1 - running low/high words of the np[]*m1 row
#   aj/bi/nj - current ap[j], bp[i], np[j] (value or pointer, phase-dependent)
#   tp       - cursor into the tp[] scratch vector on the stack
#   alo/ahi, nlo/nhi - mulq/umulh product halves in flight
#   tj       - tp[j] value, also borrowed as a loop-condition flag
($lo0, $hi0, $lo1, $hi1, $aj, $bi, $nj, $tp,
 $alo, $ahi, $nlo, $nhi, $tj) = map { "t$_" } 0..12;

# Callee-saved registers (preserved in the prologue below):
#   i - outer-loop index, j - inner-loop index, m1 - Montgomery multiplier
($i, $j, $m1) = map { "s$_" } 3..5;
$code=<<___;
#ifdef __linux__
#include <asm/regdef.h>
#else
#include <asm.h>
#include <regdef.h>
#endif

.text

.set	noat
.set	noreorder

.globl	bn_mul_mont
.align	5
.ent	bn_mul_mont
bn_mul_mont:
	lda	sp,-48(sp)		# 48-byte frame for the saved registers
	stq	ra,0(sp)
	stq	s3,8(sp)
	stq	s4,16(sp)
	stq	s5,24(sp)
	stq	fp,32(sp)
	mov	sp,fp
	.mask	0x0400f000,-48		# saved set: ra(26), s3-s5(12-14), fp(15)
	.frame	fp,48,ra
	.prologue 0

	.align	4
	.set	reorder
	sextl	$num,$num		# num arrives as 32-bit int; sign-extend
	mov	0,v0			# return 0 = "not handled" on early exit
	cmplt	$num,4,AT
	bne	AT,.Lexit		# refuse num<4, let the C path handle it

	ldq	$hi0,0($ap)		# ap[0]
	s8addq	$num,16,AT		# AT = num*8+16: tp[num] plus 2 spare words
	ldq	$aj,8($ap)
	subq	sp,AT,sp		# carve the tp[] scratch vector off the stack
	ldq	$bi,0($bp)		# bp[0]
	lda	AT,-4096(zero)		# mov -4096,AT
	ldq	$n0,0($n0)		# n0 = *n0 (Montgomery constant -n^-1 mod 2^64)
	and	sp,AT,sp		# round sp down to a 4096-byte boundary

	mulq	$hi0,$bi,$lo0		# ap[0]*bp[0], low half
	ldq	$hi1,0($np)		# np[0]
	umulh	$hi0,$bi,$hi0		# ap[0]*bp[0], high half
	ldq	$nj,8($np)

	mulq	$lo0,$n0,$m1		# m1 = lo0*n0 mod 2^64, reduction multiplier

	mulq	$hi1,$m1,$lo1		# np[0]*m1, low half
	umulh	$hi1,$m1,$hi1		# np[0]*m1, high half

	addq	$lo1,$lo0,$lo1		# low words cancel mod 2^64 by choice of m1
	cmpult	$lo1,$lo0,AT		# carry out of the addition
	addq	$hi1,AT,$hi1

	mulq	$aj,$bi,$alo		# prime the pipeline with ap[1]*bp[0]
	mov	2,$j
	umulh	$aj,$bi,$ahi
	mov	sp,$tp			# tp = &tp[0]

	mulq	$nj,$m1,$nlo		# ... and np[1]*m1
	s8addq	$j,$ap,$aj		# aj = &ap[2]
	umulh	$nj,$m1,$nhi
	s8addq	$j,$np,$nj		# nj = &np[2]
	.align	4
.L1st:
	# First outer iteration (i=0): tp[] = (ap[]*bp[0] + m1*np[]) / 2^64.
	# Software-pipelined: the multiplies for index j issue while the
	# additions for index j-1 retire; hand-scheduled, hence noreorder.
	.set	noreorder
	ldq	$aj,0($aj)		# load ap[j]
	addl	$j,1,$j
	ldq	$nj,0($nj)		# load np[j]
	lda	$tp,8($tp)

	addq	$alo,$hi0,$lo0		# a-row: low product + incoming carry
	mulq	$aj,$bi,$alo		# start ap[j]*bp[0]
	cmpult	$lo0,$hi0,AT
	addq	$nlo,$hi1,$lo1		# n-row: low product + incoming carry

	mulq	$nj,$m1,$nlo		# start np[j]*m1
	addq	$ahi,AT,$hi0
	cmpult	$lo1,$hi1,v0
	cmplt	$j,$num,$tj		# loop condition, $tj borrowed as flag

	umulh	$aj,$bi,$ahi
	addq	$nhi,v0,$hi1
	addq	$lo1,$lo0,$lo1		# merge the two rows
	s8addq	$j,$ap,$aj

	umulh	$nj,$m1,$nhi
	cmpult	$lo1,$lo0,v0
	addq	$hi1,v0,$hi1
	s8addq	$j,$np,$nj

	stq	$lo1,-8($tp)		# tp[j-1]

	unop				# keep the branch aligned
	bne	$tj,.L1st
	.set	reorder

	# Drain the pipeline: last column of the first iteration.
	addq	$alo,$hi0,$lo0
	addq	$nlo,$hi1,$lo1
	cmpult	$lo0,$hi0,AT
	cmpult	$lo1,$hi1,v0
	addq	$ahi,AT,$hi0
	addq	$nhi,v0,$hi1

	addq	$lo1,$lo0,$lo1
	cmpult	$lo1,$lo0,v0
	addq	$hi1,v0,$hi1

	stq	$lo1,0($tp)		# tp[num-1]

	addq	$hi1,$hi0,$hi1		# upmost word ...
	cmpult	$hi1,$hi0,AT		# ... and its carry
	stq	$hi1,8($tp)		# tp[num]
	stq	AT,16($tp)		# tp[num+1]

	mov	1,$i
	.align	4
.Louter:
	# Iterations i=1..num-1: tp[] = (tp[] + ap[]*bp[i] + m1*np[]) / 2^64.
	s8addq	$i,$bp,$bi		# bi = &bp[i]
	ldq	$hi0,0($ap)		# ap[0]
	ldq	$aj,8($ap)		# ap[1]
	ldq	$bi,0($bi)		# bp[i]
	ldq	$hi1,0($np)		# np[0]
	ldq	$nj,8($np)		# np[1]
	ldq	$tj,0(sp)		# tp[0]

	mulq	$hi0,$bi,$lo0		# ap[0]*bp[i]
	umulh	$hi0,$bi,$hi0

	addq	$lo0,$tj,$lo0		# + tp[0]
	cmpult	$lo0,$tj,AT
	addq	$hi0,AT,$hi0

	mulq	$lo0,$n0,$m1		# fresh reduction multiplier for this i

	mulq	$hi1,$m1,$lo1		# np[0]*m1
	umulh	$hi1,$m1,$hi1

	addq	$lo1,$lo0,$lo1		# low word annihilates mod 2^64
	cmpult	$lo1,$lo0,AT
	mov	2,$j
	addq	$hi1,AT,$hi1

	mulq	$aj,$bi,$alo		# prime the pipeline: ap[1]*bp[i]
	mov	sp,$tp
	umulh	$aj,$bi,$ahi

	mulq	$nj,$m1,$nlo		# ... and np[1]*m1
	s8addq	$j,$ap,$aj
	umulh	$nj,$m1,$nhi
	.align	4
.Linner:
	# Same pipelined column step as .L1st, plus the tp[j] accumulate.
	# The #U/#L tags record the intended 21264 issue slot (upper/lower
	# pipe) for each instruction in the hand schedule.
	.set	noreorder
	ldq	$tj,8($tp)	#L0	# tp[j]
	nop			#U1
	ldq	$aj,0($aj)	#L1	# ap[j]
	s8addq	$j,$np,$nj	#U0

	ldq	$nj,0($nj)	#L0	# np[j]
	nop			#U1
	addq	$alo,$hi0,$lo0	#L1
	lda	$tp,8($tp)

	mulq	$aj,$bi,$alo	#U1
	cmpult	$lo0,$hi0,AT	#L0
	addq	$nlo,$hi1,$lo1	#L1
	addl	$j,1,$j

	mulq	$nj,$m1,$nlo	#U1
	addq	$ahi,AT,$hi0	#L0
	addq	$lo0,$tj,$lo0	#L1	# + tp[j]
	cmpult	$lo1,$hi1,v0	#U0

	umulh	$aj,$bi,$ahi	#U1
	cmpult	$lo0,$tj,AT	#L0
	addq	$lo1,$lo0,$lo1	#L1
	addq	$nhi,v0,$hi1	#U0

	umulh	$nj,$m1,$nhi	#U1
	s8addq	$j,$ap,$aj	#L0
	cmpult	$lo1,$lo0,v0	#L1
	cmplt	$j,$num,$tj	#U0	# borrow $tj

	addq	$hi0,AT,$hi0	#L0
	addq	$hi1,v0,$hi1	#U1
	stq	$lo1,-8($tp)	#L1	# tp[j-1]
	bne	$tj,.Linner	#U0
	.set	reorder

	# Drain the pipeline and fold the top words into tp[num..num+1].
	ldq	$tj,8($tp)		# tp[j]
	addq	$alo,$hi0,$lo0
	addq	$nlo,$hi1,$lo1
	cmpult	$lo0,$hi0,AT
	cmpult	$lo1,$hi1,v0
	addq	$ahi,AT,$hi0
	addq	$nhi,v0,$hi1

	addq	$lo0,$tj,$lo0
	cmpult	$lo0,$tj,AT
	addq	$hi0,AT,$hi0

	ldq	$tj,16($tp)		# previous carry word tp[num+1]
	addq	$lo1,$lo0,$j		# $j borrowed as scratch
	cmpult	$j,$lo0,v0
	addq	$hi1,v0,$hi1

	addq	$hi1,$hi0,$lo1
	stq	$j,0($tp)		# tp[num-1]
	cmpult	$lo1,$hi0,$hi1
	addq	$lo1,$tj,$lo1
	cmpult	$lo1,$tj,AT
	addl	$i,1,$i
	addq	$hi1,AT,$hi1
	stq	$lo1,8($tp)		# tp[num]
	cmplt	$i,$num,$tj		# borrow $tj
	stq	$hi1,16($tp)		# tp[num+1]
	bne	$tj,.Louter

	# Conditional final subtraction: the result may exceed the modulus
	# by at most one multiple, so compute tp[] - np[] and keep whichever
	# of the two does not borrow.
	s8addq	$num,sp,$tj		# &tp[num]
	mov	$rp,$bp			# put rp aside
	mov	sp,$tp
	mov	sp,$ap
	mov	0,$hi0			# clear borrow bit

.align	4
.Lsub:	ldq	$lo0,0($tp)
	ldq	$lo1,0($np)
	lda	$tp,8($tp)
	lda	$np,8($np)
	subq	$lo0,$lo1,$lo1		# tp[i]-np[i]
	cmpult	$lo0,$lo1,AT		# borrow out of the subtraction
	subq	$lo1,$hi0,$lo0		# apply incoming borrow
	cmpult	$lo1,$lo0,$hi0
	or	$hi0,AT,$hi0		# accumulate borrow for next word
	stq	$lo0,0($rp)
	cmpult	$tp,$tj,v0
	lda	$rp,8($rp)
	bne	v0,.Lsub

	subq	$hi1,$hi0,$hi0		# handle upmost overflow bit:
					# 0 if tp>=np, all-ones if it borrowed
	mov	sp,$tp
	mov	$bp,$rp			# restore rp

	# Branchless select of the copy source using the 0/-1 mask in hi0.
	and	sp,$hi0,$ap
	bic	$bp,$hi0,$bp
	bis	$bp,$ap,$ap		# ap=borrow?tp:rp

.align	4
.Lcopy:	ldq	$aj,0($ap)		# copy or in-place refresh
	lda	$tp,8($tp)
	lda	$rp,8($rp)
	lda	$ap,8($ap)
	stq	zero,-8($tp)		# zap tp (don't leave secrets on stack)
	cmpult	$tp,$tj,AT
	stq	$aj,-8($rp)
	bne	AT,.Lcopy
	mov	1,v0			# return 1 = "handled"

.Lexit:
	.set	noreorder
	mov	fp,sp
	/*ldq	ra,0(sp)*/		# ra was never clobbered; no need to reload
	ldq	s3,8(sp)
	ldq	s4,16(sp)
	ldq	s5,24(sp)
	ldq	fp,32(sp)
	lda	sp,48(sp)
	ret	(ra)
.end	bn_mul_mont
	.ascii	"Montgomery Multiplication for Alpha, CRYPTOGAMS by <appro\@openssl.org>"
	.align	2
# Emit the generated assembly on STDOUT.  Check the close: STDOUT is
# normally redirected to the output .s file by the build system, and a
# write/flush error (e.g. full disk) would otherwise be silently
# swallowed, leaving a truncated assembly file that still "succeeded".
print $code;
close STDOUT or die "error closing STDOUT: $!";