2 ; PA-RISC 64-bit implementation of bn_asm code
4 ; This code is approximately 2x faster than the C version
7 ; See http://devresource.hp.com/ for more details on the PA-RISC
8 ; architecture. Also see the book "PA-RISC 2.0 Architecture"
9 ; by Gerry Kane for information on the instruction set architecture.
11 ; Code written by Chris Ruemmler (with some help from the HP C compiler)
14 ; The code compiles with HP's assembler
19 .subspa $CODE$,QUAD=0,ALIGN=8,ACCESS=0x2c,CODE_ONLY
22 ; Global Register definitions used for the routines.
24 ; Some information about HP's runtime architecture for 64-bits.
26 ; "Caller save" means the calling function must save the register
27 ; if it wants the register to be preserved.
28 ; "Callee save" means if a function uses the register, it must save
29 ; the value before using it.
31 ; For the floating point registers
33 ; "caller save" registers: fr4-fr11, fr22-fr31
34 ; "callee save" registers: fr12-fr21
35 ; "special" registers: fr0-fr3 (status and exception registers)
37 ; For the integer registers
39 ; "caller save" registers: r1,r19-r26
40 ; "callee save" registers: r3-r18
41 ; return register : r2 (rp)
42 ; return values ; r28 (ret0,ret1)
43 ; Stack pointer ; r30 (sp)
44 ; global data pointer ; r27 (dp)
45 ; argument pointer ; r29 (ap)
46 ; millicode return ptr ; r31 (also a caller save register)
50 ; Arguments to the routines
61 ; Globals used in some routines
; Register aliases shared by several routines in this file.
;   top_overflow : the multiply routines load it with 1 << 32 (DEPDI,Z 1,31,1)
;   high_mask    : the squaring routines load it with 0xffffffff80000000
64 top_overflow
.reg %r29
65 high_mask
.reg %r22 ; value 0xffffffff80000000L
; bn_mul_add_words -- behaviour visible in this listing:
;   for each of the num words: rp[i] = low 64 bits of (a[i]*w + rp[i] + c),
;   with the running carry c held in %ret0, which is also the return value.
; Each 64x64 product is assembled from four 32x32 XMPYU partial products that
; are bounced through stack slots to get from the FP to the integer registers.
; NOTE(review): this listing carries fused line-number prefixes, wrapped
; tokens and gaps (.PROC/.ENTRY, the local .reg aliases and the return branch
; are not visible) -- confirm details against the complete file.
68 ;
------------------------------------------------------------------------------
72 ;BN_ULONG bn_mul_add_words
(BN_ULONG
*r_ptr
, BN_ULONG
*a_ptr
,
73 ; int num
, BN_ULONG w
)
80 ; Local register definitions
118 .export bn_mul_add_words,entry,NO_RELOCATION,LONG_RETURN
; Prologue: spill r3-r9 into the frame, clear the carry (%ret0), build
; 1 << 32 in top_overflow and park w at 56(%sp) so it can be reloaded into an
; FP register once the frame has been allocated (56 - 128 = -72).
124 STD %r3,0(%sp
) ; save
r3
125 STD %r4,8(%sp
) ; save
r4
126 NOP ; Needed to make the loop
16-byte aligned
127 NOP ; Needed to make the loop
16-byte aligned
129 STD %r5,16(%sp
) ; save
r5
130 STD %r6,24(%sp
) ; save
r6
131 STD %r7,32(%sp
) ; save
r7
132 STD %r8,40(%sp
) ; save
r8
134 STD %r9,48(%sp
) ; save
r9
135 COPY
%r0,%ret0 ; return
0 by default
136 DEPDI
,Z
1,31,1,top_overflow ; top_overflow
= 1 << 32
137 STD w
,56(%sp
) ; store w on stack
; The instruction following each CMPIB below occupies the branch delay slot,
; so the 128-byte frame is allocated on the early-exit path as well; the
; epilogue restores relative to the bumped %sp.
139 CMPIB
,>= 0,num
,bn_mul_add_words_exit ; if
(num
<= 0) then exit
140 LDO
128(%sp
),%sp ; bump stack
143 ; The loop is unrolled twice
, so if there is only
1 number
144 ; then go straight to the cleanup code.
146 CMPIB
,= 1,num
,bn_mul_add_words_single_top
147 FLDD
-72(%sp
),fw ; load up w into fp register fw
(fw_h
/fw_l
)
150 ; This loop is unrolled
2 times
(64-byte aligned as well
)
152 ; PA-RISC
2.0 chips have two fully pipelined multipliers
, thus
153 ; two
32-bit mutiplies can
be issued per cycle.
; Unrolled body: processes a[0]/a[1] against rp[0]/rp[1] on every pass.
; Stack slots used for the FP -> integer transfer:
;   word 0: -8 (m)  -16 (m1)  -24 (ht)  -32 (lt)
;   word 1: -40 (m) -48 (m1)  -56 (ht)  -64 (lt)
155 bn_mul_add_words_unroll2
157 FLDD
0(a_ptr
),t_float_0 ; load up
64-bit value
(fr8L
) ht
(L)/lt
(R
)
158 FLDD
8(a_ptr
),t_float_1 ; load up
64-bit value
(fr8L
) ht
(L)/lt
(R
)
159 LDD
0(r_ptr
),rp_val ; rp
[0]
160 LDD
8(r_ptr
),rp_val_1 ; rp
[1]
162 XMPYU fht_0
,fw_l
,fm1 ; m1
[0] = fht_0
*fw_l
163 XMPYU fht_1
,fw_l
,fm1_1 ; m1
[1] = fht_1
*fw_l
164 FSTD fm1
,-16(%sp
) ;
-16(sp
) = m1
[0]
165 FSTD fm1_1
,-48(%sp
) ;
-48(sp
) = m1
[1]
167 XMPYU flt_0
,fw_h
,fm ; m
[0] = flt_0
*fw_h
168 XMPYU flt_1
,fw_h
,fm_1 ; m
[1] = flt_1
*fw_h
169 FSTD
fm,-8(%sp
) ;
-8(sp
) = m
[0]
170 FSTD fm_1
,-40(%sp
) ;
-40(sp
) = m
[1]
172 XMPYU fht_0
,fw_h
,ht_temp ; ht_temp
= fht_0
*fw_h
173 XMPYU fht_1
,fw_h
,ht_temp_1 ; ht_temp_1
= fht_1
*fw_h
174 FSTD ht_temp
,-24(%sp
) ;
-24(sp
) = ht_temp
175 FSTD ht_temp_1
,-56(%sp
) ;
-56(sp
) = ht_temp_1
177 XMPYU flt_0
,fw_l
,lt_temp ; lt_temp
= lt
*fw_l
178 XMPYU flt_1
,fw_l
,lt_temp_1 ; lt_temp
= lt
*fw_l
179 FSTD lt_temp
,-32(%sp
) ;
-32(sp
) = lt_temp
180 FSTD lt_temp_1
,-64(%sp
) ;
-64(sp
) = lt_temp_1
; NOTE(review): loads of lt_0/lt_1 from -32/-64(%sp) are not visible in this
; listing although lt_0/lt_1 are consumed below -- confirm against the file.
182 LDD
-8(%sp
),m_0 ; m
[0]
183 LDD
-40(%sp
),m_1 ; m
[1]
184 LDD
-16(%sp
),m1_0 ; m1
[0]
185 LDD
-48(%sp
),m1_1 ; m1
[1]
187 LDD
-24(%sp
),ht_0 ; ht
[0]
188 LDD
-56(%sp
),ht_1 ; ht
[1]
189 ADD,L m1_0
,m_0
,tmp_0 ; tmp_0
= m
[0] + m1
[0];
190 ADD,L m1_1
,m_1
,tmp_1 ; tmp_1
= m
[1] + m1
[1];
; If the middle sum wrapped (tmp < m1) the lost bit is worth 1 << 32 in ht;
; CMPCLR nullifies the following ADD,L when no wrap occurred.
194 CMPCLR
,*>>= tmp_0
,m1_0
, %r0 ; if
(m
[0] < m1
[0])
195 ADD,L ht_0
,top_overflow
,ht_0 ; ht
[0] += (1<<32)
197 CMPCLR
,*>>= tmp_1
,m1_1
,%r0 ; if
(m
[1] < m1
[1])
198 ADD,L ht_1
,top_overflow
,ht_1 ; ht
[1] += (1<<32)
199 EXTRD
,U tmp_0
,31,32,m_0 ; m
[0]>>32
200 DEPD
,Z tmp_0
,31,32,m1_0 ; m1
[0] = m
[0]<<32
202 EXTRD
,U tmp_1
,31,32,m_1 ; m
[1]>>32
203 DEPD
,Z tmp_1
,31,32,m1_1 ; m1
[1] = m
[1]<<32
204 ADD,L ht_0
,m_0
,ht_0 ; ht
[0]+= (m
[0]>>32)
205 ADD,L ht_1
,m_1
,ht_1 ; ht
[1]+= (m
[1]>>32)
; Each plain ADD below is paired with an ADD,DC that folds its carry-out into
; the matching high word.
207 ADD lt_0
,m1_0
,lt_0 ; lt
[0] = lt
[0]+m1
[0];
208 ADD,DC ht_0
,%r0,ht_0 ; ht
[0]++
209 ADD lt_1
,m1_1
,lt_1 ; lt
[1] = lt
[1]+m1
[1];
210 ADD,DC ht_1
,%r0,ht_1 ; ht
[1]++
212 ADD %ret0
,lt_0
,lt_0 ; lt
[0] = lt
[0] + c;
213 ADD,DC ht_0
,%r0,ht_0 ; ht
[0]++
214 ADD lt_0
,rp_val
,lt_0 ; lt
[0] = lt
[0]+rp
[0]
215 ADD,DC ht_0
,%r0,ht_0 ; ht
[0]++
217 LDO
-2(num
),num ; num
= num
- 2;
218 ADD ht_0
,lt_1
,lt_1 ; lt
[1] = lt
[1] + ht_0
(c
);
219 ADD,DC ht_1
,%r0,ht_1 ; ht
[1]++
220 STD lt_0
,0(r_ptr
) ; rp
[0] = lt
[0]
222 ADD lt_1
,rp_val_1
,lt_1 ; lt
[1] = lt
[1]+rp
[1]
; The final ADD,DC writes straight into %ret0: ht[1] plus carry becomes the
; carry for the next pass (and the return value).
223 ADD,DC ht_1
,%r0,%ret0 ; ht
[1]++
224 LDO
16(a_ptr
),a_ptr ; a_ptr
+= 2
226 STD lt_1
,8(r_ptr
) ; rp
[1] = lt
[1]
227 CMPIB
,<= 2,num
,bn_mul_add_words_unroll2 ; go again if more to do
228 LDO
16(r_ptr
),r_ptr ; r_ptr
+= 2
230 CMPIB
,=,N
0,num
,bn_mul_add_words_exit ; are we done
, or cleanup last one
233 ; Top of loop aligned on
64-byte boundary
; Tail: handles the single remaining word with the same partial-product scheme.
; NOTE(review): only the m1 reload (-16) is visible here; the loads of m, ht
; and lt from -8/-24/-32(%sp) do not appear in this listing -- confirm.
235 bn_mul_add_words_single_top
236 FLDD
0(a_ptr
),t_float_0 ; load up
64-bit value
(fr8L
) ht
(L)/lt
(R
)
237 LDD
0(r_ptr
),rp_val ; rp
[0]
238 LDO
8(a_ptr
),a_ptr ; a_ptr+
+
239 XMPYU fht_0
,fw_l
,fm1 ; m1
= ht
*fw_l
240 FSTD fm1
,-16(%sp
) ;
-16(sp
) = m1
241 XMPYU flt_0
,fw_h
,fm ; m
= lt
*fw_h
242 FSTD
fm,-8(%sp
) ;
-8(sp
) = m
243 XMPYU fht_0
,fw_h
,ht_temp ; ht_temp
= ht
*fw_h
244 FSTD ht_temp
,-24(%sp
) ;
-24(sp
) = ht
245 XMPYU flt_0
,fw_l
,lt_temp ; lt_temp
= lt
*fw_l
246 FSTD lt_temp
,-32(%sp
) ;
-32(sp
) = lt
249 LDD
-16(%sp
),m1_0 ; m1
= temp1
250 ADD,L m_0
,m1_0
,tmp_0 ; tmp_0
= m
+ m1;
254 CMPCLR
,*>>= tmp_0
,m1_0
,%r0 ; if
(m
< m1
)
255 ADD,L ht_0
,top_overflow
,ht_0 ; ht
+= (1<<32)
257 EXTRD
,U tmp_0
,31,32,m_0 ; m
>>32
258 DEPD
,Z tmp_0
,31,32,m1_0 ; m1
= m
<<32
260 ADD,L ht_0
,m_0
,ht_0 ; ht+
= (m
>>32)
261 ADD lt_0
,m1_0
,tmp_0 ; tmp_0
= lt+m1;
262 ADD,DC ht_0
,%r0,ht_0 ; ht+
+
263 ADD %ret0
,tmp_0
,lt_0 ; lt
= lt
+ c;
264 ADD,DC ht_0
,%r0,ht_0 ; ht+
+
265 ADD lt_0
,rp_val
,lt_0 ; lt
= lt+rp
[0]
266 ADD,DC ht_0
,%r0,%ret0 ; ht+
+
267 STD lt_0
,0(r_ptr
) ; rp
[0] = lt
; Epilogue: reload r9..r4 relative to the bumped %sp; the closing LDD,MB
; moves %sp back down by 128 and restores r3 in one instruction.
269 bn_mul_add_words_exit
271 LDD
-80(%sp
),%r9 ; restore
r9
272 LDD
-88(%sp
),%r8 ; restore
r8
273 LDD
-96(%sp
),%r7 ; restore
r7
274 LDD
-104(%sp
),%r6 ; restore
r6
275 LDD
-112(%sp
),%r5 ; restore
r5
276 LDD
-120(%sp
),%r4 ; restore
r4
278 LDD
,MB
-128(%sp
),%r3 ; restore
r3
279 .PROCEND ;in=23,24,25,26,29;out=28;
; bn_mul_words -- behaviour visible in this listing:
;   for each of the num words: rp[i] = low 64 bits of (a[i]*w + c), with the
;   running carry c held in %ret0, which is also the return value.
; Same four-partial-product scheme as bn_mul_add_words, minus the rp[i] add.
; NOTE(review): the labels bn_mul_words_unroll2 and bn_mul_words_exit are
; branched to but not visible here, and the loads of m/m1/ht/lt from the
; stack slots are missing from this listing -- confirm against the full file.
281 ;
----------------------------------------------------------------------------
283 ;BN_ULONG bn_mul_words
(BN_ULONG
*rp
, BN_ULONG
*ap
, int num
, BN_ULONG w
)
294 .EXPORT bn_mul_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
; Prologue: spill r3-r7, clear the carry, build 1 << 32, park w at 56(%sp).
297 STD %r3,0(%sp
) ; save
r3
298 STD %r4,8(%sp
) ; save
r4
299 STD %r5,16(%sp
) ; save
r5
300 STD %r6,24(%sp
) ; save
r6
302 STD %r7,32(%sp
) ; save
r7
303 COPY
%r0,%ret0 ; return
0 by default
304 DEPDI
,Z
1,31,1,top_overflow ; top_overflow
= 1 << 32
305 STD w
,56(%sp
) ; w on stack
; The LDO / FLDD after each CMPIB occupy the branch delay slot.
307 CMPIB
,>= 0,num
,bn_mul_words_exit
308 LDO
128(%sp
),%sp ; bump stack
311 ; See if only
1 word to do
, thus just do cleanup
313 CMPIB
,= 1,num
,bn_mul_words_single_top
314 FLDD
-72(%sp
),fw ; load up w into fp register fw
(fw_h
/fw_l
)
317 ; This loop is unrolled
2 times
(64-byte aligned as well
)
319 ; PA-RISC
2.0 chips have two fully pipelined multipliers
, thus
320 ; two
32-bit mutiplies can
be issued per cycle.
; Unrolled body: two words per pass; word 0 uses slots -8/-16/-24/-32 and
; word 1 uses -40/-48/-56/-64.
324 FLDD
0(a_ptr
),t_float_0 ; load up
64-bit value
(fr8L
) ht
(L)/lt
(R
)
325 FLDD
8(a_ptr
),t_float_1 ; load up
64-bit value
(fr8L
) ht
(L)/lt
(R
)
326 XMPYU fht_0
,fw_l
,fm1 ; m1
[0] = fht_0
*fw_l
327 XMPYU fht_1
,fw_l
,fm1_1 ; m1
[1] = ht
*fw_l
329 FSTD fm1
,-16(%sp
) ;
-16(sp
) = m1
330 FSTD fm1_1
,-48(%sp
) ;
-48(sp
) = m1
331 XMPYU flt_0
,fw_h
,fm ; m
= lt
*fw_h
332 XMPYU flt_1
,fw_h
,fm_1 ; m
= lt
*fw_h
334 FSTD
fm,-8(%sp
) ;
-8(sp
) = m
335 FSTD fm_1
,-40(%sp
) ;
-40(sp
) = m
336 XMPYU fht_0
,fw_h
,ht_temp ; ht_temp
= fht_0
*fw_h
337 XMPYU fht_1
,fw_h
,ht_temp_1 ; ht_temp
= ht
*fw_h
339 FSTD ht_temp
,-24(%sp
) ;
-24(sp
) = ht
340 FSTD ht_temp_1
,-56(%sp
) ;
-56(sp
) = ht
341 XMPYU flt_0
,fw_l
,lt_temp ; lt_temp
= lt
*fw_l
342 XMPYU flt_1
,fw_l
,lt_temp_1 ; lt_temp
= lt
*fw_l
344 FSTD lt_temp
,-32(%sp
) ;
-32(sp
) = lt
345 FSTD lt_temp_1
,-64(%sp
) ;
-64(sp
) = lt
354 ADD,L m1_0
,m_0
,tmp_0 ; tmp_0
= m
+ m1;
355 ADD,L m1_1
,m_1
,tmp_1 ; tmp_1
= m
+ m1;
; A wrapped middle sum is worth 1 << 32 in ht; CMPCLR nullifies the ADD,L
; when no wrap occurred.
359 CMPCLR
,*>>= tmp_0
,m1_0
, %r0 ; if
(m
< m1
)
360 ADD,L ht_0
,top_overflow
,ht_0 ; ht
+= (1<<32)
361 CMPCLR
,*>>= tmp_1
,m1_1
,%r0 ; if
(m
< m1
)
362 ADD,L ht_1
,top_overflow
,ht_1 ; ht
+= (1<<32)
364 EXTRD
,U tmp_0
,31,32,m_0 ; m
>>32
365 DEPD
,Z tmp_0
,31,32,m1_0 ; m1
= m
<<32
366 EXTRD
,U tmp_1
,31,32,m_1 ; m
>>32
367 DEPD
,Z tmp_1
,31,32,m1_1 ; m1
= m
<<32
369 ADD,L ht_0
,m_0
,ht_0 ; ht+
= (m
>>32)
370 ADD,L ht_1
,m_1
,ht_1 ; ht+
= (m
>>32)
371 ADD lt_0
,m1_0
,lt_0 ; lt
= lt+m1;
372 ADD,DC ht_0
,%r0,ht_0 ; ht+
+
374 ADD lt_1
,m1_1
,lt_1 ; lt
= lt+m1;
375 ADD,DC ht_1
,%r0,ht_1 ; ht+
+
376 ADD %ret0
,lt_0
,lt_0 ; lt
= lt
+ c
(ret0
);
377 ADD,DC ht_0
,%r0,ht_0 ; ht+
+
; Word 0's high half (ht_0) is the carry into word 1.
379 ADD ht_0
,lt_1
,lt_1 ; lt
= lt
+ c
(ht_0
)
380 ADD,DC ht_1
,%r0,ht_1 ; ht+
+
381 STD lt_0
,0(r_ptr
) ; rp
[0] = lt
382 STD lt_1
,8(r_ptr
) ; rp
[1] = lt
384 COPY ht_1
,%ret0 ; carry
= ht
385 LDO
-2(num
),num ; num
= num
- 2;
386 LDO
16(a_ptr
),a_ptr ; ap
+= 2
387 CMPIB
,<= 2,num
,bn_mul_words_unroll2
388 LDO
16(r_ptr
),r_ptr ; rp+
+
390 CMPIB
,=,N
0,num
,bn_mul_words_exit ; are we done?
393 ; Top of loop aligned on
64-byte boundary
; Tail: one remaining word.
395 bn_mul_words_single_top
396 FLDD
0(a_ptr
),t_float_0 ; load up
64-bit value
(fr8L
) ht
(L)/lt
(R
)
398 XMPYU fht_0
,fw_l
,fm1 ; m1
= ht
*fw_l
399 FSTD fm1
,-16(%sp
) ;
-16(sp
) = m1
400 XMPYU flt_0
,fw_h
,fm ; m
= lt
*fw_h
401 FSTD
fm,-8(%sp
) ;
-8(sp
) = m
402 XMPYU fht_0
,fw_h
,ht_temp ; ht_temp
= ht
*fw_h
403 FSTD ht_temp
,-24(%sp
) ;
-24(sp
) = ht
404 XMPYU flt_0
,fw_l
,lt_temp ; lt_temp
= lt
*fw_l
405 FSTD lt_temp
,-32(%sp
) ;
-32(sp
) = lt
409 ADD,L m_0
,m1_0
,tmp_0 ; tmp_0
= m
+ m1;
413 CMPCLR
,*>>= tmp_0
,m1_0
,%r0 ; if
(m
< m1
)
414 ADD,L ht_0
,top_overflow
,ht_0 ; ht
+= (1<<32)
416 EXTRD
,U tmp_0
,31,32,m_0 ; m
>>32
417 DEPD
,Z tmp_0
,31,32,m1_0 ; m1
= m
<<32
419 ADD,L ht_0
,m_0
,ht_0 ; ht+
= (m
>>32)
420 ADD lt_0
,m1_0
,lt_0 ; lt
= lt+m1;
421 ADD,DC ht_0
,%r0,ht_0 ; ht+
+
423 ADD %ret0
,lt_0
,lt_0 ; lt
= lt
+ c;
424 ADD,DC ht_0
,%r0,ht_0 ; ht+
+
426 COPY ht_0
,%ret0 ; copy carry
427 STD lt_0
,0(r_ptr
) ; rp
[0] = lt
; Epilogue: reload r7..r4; LDD,MB moves %sp back by 128 and restores r3.
431 LDD
-96(%sp
),%r7 ; restore
r7
432 LDD
-104(%sp
),%r6 ; restore
r6
433 LDD
-112(%sp
),%r5 ; restore
r5
434 LDD
-120(%sp
),%r4 ; restore
r4
436 LDD
,MB
-128(%sp
),%r3 ; restore
r3
437 .PROCEND ;in=23,24,25,26,29;out=28;
; bn_sqr_words -- behaviour visible in this listing:
;   for each of the num input words the 128-bit square is written as two
;   output words: rp[2i] = low half, rp[2i+1] = high half.
; With a = ht:lt (32-bit halves) the square is ht*ht*2^64 + 2*ht*lt*2^32 +
; lt*lt.  The doubled cross term is produced by shifting m = ht*lt left by 33
; for the low word and taking its top 33 bits (via high_mask) for the high word.
; NOTE(review): the labels bn_sqr_words_unroll2 / bn_sqr_words_exit, the stack
; reloads inside the unrolled loop and the final restore of r3 are not visible
; in this listing -- confirm against the complete file.
439 ;
----------------------------------------------------------------------------
441 ;void bn_sqr_words
(BN_ULONG
*rp
, BN_ULONG
*ap
, int num
)
450 .callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
451 .EXPORT bn_sqr_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
455 STD %r3,0(%sp
) ; save
r3
456 STD %r4,8(%sp
) ; save
r4
458 STD %r5,16(%sp
) ; save
r5
; The LDO / DEPDI after each CMPIB occupy the branch delay slot, so the frame
; bump and the mask setup happen on the taken paths too.
460 CMPIB
,>= 0,num
,bn_sqr_words_exit
461 LDO
128(%sp
),%sp ; bump stack
464 ; If only
1, the goto straight to cleanup
466 CMPIB
,= 1,num
,bn_sqr_words_single_top
467 DEPDI
,Z
-1,32,33,high_mask ; Create Mask
0xffffffff80000000L
470 ; This loop is unrolled
2 times
(64-byte aligned as well
)
; Unrolled body: squares a[0] and a[1], writing rp[0..3].
; Slots: word 0 uses -24 (m) / -16 (lt) / -8 (ht); word 1 uses -56 / -48 / -40.
474 FLDD
0(a_ptr
),t_float_0 ;
a[0]
475 FLDD
8(a_ptr
),t_float_1 ;
a[1]
476 XMPYU fht_0
,flt_0
,fm ; m
[0]
477 XMPYU fht_1
,flt_1
,fm_1 ; m
[1]
479 FSTD
fm,-24(%sp
) ; store m
[0]
480 FSTD fm_1
,-56(%sp
) ; store m
[1]
481 XMPYU flt_0
,flt_0
,lt_temp ; lt
[0]
482 XMPYU flt_1
,flt_1
,lt_temp_1 ; lt
[1]
484 FSTD lt_temp
,-16(%sp
) ; store lt
[0]
485 FSTD lt_temp_1
,-48(%sp
) ; store lt
[1]
486 XMPYU fht_0
,fht_0
,ht_temp ; ht
[0]
487 XMPYU fht_1
,fht_1
,ht_temp_1 ; ht
[1]
489 FSTD ht_temp
,-8(%sp
) ; store ht
[0]
490 FSTD ht_temp_1
,-40(%sp
) ; store ht
[1]
494 AND m_0
,high_mask
,tmp_0 ; m
[0] & Mask
495 AND m_1
,high_mask
,tmp_1 ; m
[1] & Mask
496 DEPD
,Z m_0
,30,31,m_0 ; m
[0] << 32+1
497 DEPD
,Z m_1
,30,31,m_1 ; m
[1] << 32+1
501 EXTRD
,U tmp_0
,32,33,tmp_0 ; tmp_0
= m
[0]&Mask
>> 32-1
502 EXTRD
,U tmp_1
,32,33,tmp_1 ; tmp_1
= m
[1]&Mask
>> 32-1
506 ADD,L ht_0
,tmp_0
,ht_0 ; ht
[0] += tmp_0
507 ADD,L ht_1
,tmp_1
,ht_1 ; ht
[1] += tmp_1
509 ADD lt_0
,m_0
,lt_0 ; lt
= lt+m
510 ADD,DC ht_0
,%r0,ht_0 ; ht
[0]++
511 STD lt_0
,0(r_ptr
) ; rp
[0] = lt
[0]
512 STD ht_0
,8(r_ptr
) ; rp
[1] = ht
[1]
; NOTE: the store just above writes ht_0 (the high half of word 0), i.e.
; rp[1] = ht[0]; the "ht[1]" in its trailing note is a slip.
514 ADD lt_1
,m_1
,lt_1 ; lt
= lt+m
515 ADD,DC ht_1
,%r0,ht_1 ; ht
[1]++
516 STD lt_1
,16(r_ptr
) ; rp
[2] = lt
[1]
517 STD ht_1
,24(r_ptr
) ; rp
[3] = ht
[1]
519 LDO
-2(num
),num ; num
= num
- 2;
520 LDO
16(a_ptr
),a_ptr ; ap
+= 2
521 CMPIB
,<= 2,num
,bn_sqr_words_unroll2
522 LDO
32(r_ptr
),r_ptr ; rp
+= 4
524 CMPIB
,=,N
0,num
,bn_sqr_words_exit ; are we done?
527 ; Top of loop aligned on
64-byte boundary
; Tail: squares the single remaining word into rp[0] (low) and rp[1] (high).
529 bn_sqr_words_single_top
530 FLDD
0(a_ptr
),t_float_0 ; load up
64-bit value
(fr8L
) ht
(L)/lt
(R
)
532 XMPYU fht_0
,flt_0
,fm ; m
533 FSTD
fm,-24(%sp
) ; store m
535 XMPYU flt_0
,flt_0
,lt_temp ; lt
536 FSTD lt_temp
,-16(%sp
) ; store lt
538 XMPYU fht_0
,fht_0
,ht_temp ; ht
539 FSTD ht_temp
,-8(%sp
) ; store ht
541 LDD
-24(%sp
),m_0 ; load m
542 AND m_0
,high_mask
,tmp_0 ; m
& Mask
543 DEPD
,Z m_0
,30,31,m_0 ; m
<< 32+1
544 LDD
-16(%sp
),lt_0 ; lt
546 LDD
-8(%sp
),ht_0 ; ht
547 EXTRD
,U tmp_0
,32,33,tmp_0 ; tmp_0
= m
&Mask
>> 32-1
; ADD,L does not disturb the carry bit, so the ADD,DC two instructions below
; still picks up the carry produced by the plain ADD on lt.
548 ADD m_0
,lt_0
,lt_0 ; lt
= lt+m
549 ADD,L ht_0
,tmp_0
,ht_0 ; ht
+= tmp_0
550 ADD,DC ht_0
,%r0,ht_0 ; ht+
+
552 STD lt_0
,0(r_ptr
) ; rp
[0] = lt
553 STD ht_0
,8(r_ptr
) ; rp
[1] = ht
; Epilogue (only the r5/r4 restores are visible in this listing).
557 LDD
-112(%sp
),%r5 ; restore
r5
558 LDD
-120(%sp
),%r4 ; restore
r4
561 .PROCEND ;in=23,24,25,26,29;out=28;
; bn_add_words -- multi-word add with carry, per the prototype below.
; Visible per-word pattern: t = t + c; c = carry-out; l = t + b; c += carry-out,
; with the carry c kept in %ret0 (the return value, 0 by default).
; NOTE(review): the loads of t/b, the stores of l, the pointer/count updates,
; the bn_add_words_unroll2 / bn_add_words_exit labels and the return are not
; visible in this listing -- confirm against the complete file.
564 ;
----------------------------------------------------------------------------
566 ;BN_ULONG bn_add_words
(BN_ULONG
*r
, BN_ULONG
*a, BN_ULONG
*b, int n
)
581 .EXPORT bn_add_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
; n <= 0: leave immediately; the COPY in the delay slot makes the result 0.
584 CMPIB
,>= 0,n
,bn_add_words_exit
585 COPY
%r0,%ret0 ; return
0 by default
588 ; If
2 or more numbers do the loop
590 CMPIB
,= 1,n
,bn_add_words_single_top
594 ; This loop is unrolled
2 times
(64-byte aligned as well
)
; First word of the pair.  "ADD,DC %r0,%r0,%ret0" materialises the carry bit
; as 0 or 1; the second ADD,DC accumulates the carry from adding b.
599 ADD t,%ret0
,t ;
t = t+c;
600 ADD,DC
%r0,%r0,%ret0 ; set c to carry
601 ADD t,b,l ;
l = t + b[0]
602 ADD,DC
%ret0
,%r0,%ret0 ; c+
= carry
; Second word of the pair (same sequence).
607 ADD t,%ret0
,t ;
t = t+c;
608 ADD,DC
%r0,%r0,%ret0 ; set c to carry
609 ADD t,b,l ;
l = t + b[0]
610 ADD,DC
%ret0
,%r0,%ret0 ; c+
= carry
617 CMPIB
,<= 2,n
,bn_add_words_unroll2
620 CMPIB
,=,N
0,n
,bn_add_words_exit ; are we done?
; Tail: one remaining word.
622 bn_add_words_single_top
626 ADD t,%ret0
,t ;
t = t+c;
627 ADD,DC
%r0,%r0,%ret0 ; set c to carry
(could use CMPCLR??
)
628 ADD t,b,l ;
l = t + b[0]
629 ADD,DC
%ret0
,%r0,%ret0 ; c+
= carry
636 .PROCEND ;in=23,24,25,26,29;out=28;
; bn_sub_words -- multi-word subtract with borrow, per the prototype below.
; Visible per-word pattern: t3 = t1 - t2; t3 -= c; store t3 into r, with the
; borrow c kept in %ret0 (the return value, 0 by default).
; NOTE(review): the loads of t1/t2, the instructions that turn the CMPCLR
; result into the next borrow, the pointer/count updates, the
; bn_sub_words_unroll2 / bn_sub_words_exit labels and the return are not
; visible in this listing -- confirm against the complete file.
638 ;
----------------------------------------------------------------------------
640 ;BN_ULONG bn_sub_words
(BN_ULONG
*r
, BN_ULONG
*a, BN_ULONG
*b, int n
)
656 .EXPORT bn_sub_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
; n <= 0: leave immediately; the COPY in the delay slot makes the result 0.
660 CMPIB
,>= 0,n
,bn_sub_words_exit
661 COPY
%r0,%ret0 ; return
0 by default
664 ; If
2 or more numbers do the loop
666 CMPIB
,= 1,n
,bn_sub_words_single_top
670 ; This loop is unrolled
2 times
(64-byte aligned as well
)
; First word of the pair -> 0(r_ptr).
675 SUB t1
,t2
,sub_tmp1 ; t3
= t1-t2;
676 SUB sub_tmp1
,%ret0
,sub_tmp1 ; t3
= t3- c;
; CMPCLR zeroes sub_tmp2 and, when t1 > t2 (unsigned 64-bit), nullifies the
; instruction that follows it.
678 CMPCLR
,*>> t1
,t2
,sub_tmp2 ; clear if t1
> t2
683 STD sub_tmp1
,0(r_ptr
)
; Second word of the pair -> 8(r_ptr).
687 SUB t1
,t2
,sub_tmp1 ; t3
= t1-t2;
688 SUB sub_tmp1
,%ret0
,sub_tmp1 ; t3
= t3- c;
689 CMPCLR
,*>> t1
,t2
,sub_tmp2 ; clear if t1
> t2
694 STD sub_tmp1
,8(r_ptr
)
700 CMPIB
,<= 2,n
,bn_sub_words_unroll2
703 CMPIB
,=,N
0,n
,bn_sub_words_exit ; are we done?
; Tail: one remaining word.
705 bn_sub_words_single_top
708 SUB t1
,t2
,sub_tmp1 ; t3
= t1-t2;
709 SUB sub_tmp1
,%ret0
,sub_tmp1 ; t3
= t3- c;
710 CMPCLR
,*>> t1
,t2
,sub_tmp2 ; clear if t1
> t2
716 STD sub_tmp1
,0(r_ptr
)
722 .PROCEND ;in=23,24,25,26,29;out=28;
; bn_div_words -- per the prototype below takes (h, l, d).
; What this listing shows:
;   * d, h and l are copied to r3, r4 and r5; %ret0 defaults to -1 and the
;     routine branches to $D3 when d == 0;
;   * BN_num_bits_word is called, with gp (r27) saved/restored around it;
;   * d and the h:l pair are shifted left by i = 64 - (bit count);
;   * the quotient is produced in two 32-bit steps (loop counter r9), each
;     using a $$div2U millicode estimate q that is corrected by comparing
;     q*d -- built from 32x32 XMPYU products -- against the remainder;
;   * the result is assembled as (q << 32) | q into %ret0;
;   * bn_div_err_case sets up arguments from __iob and the C$4 literal.
; NOTE(review): many lines (labels such as $D1/$D2/$D3, the millicode branch,
; the frame setup, the fprintf/abort calls, the return) are absent from this
; listing, so the flow above is partial -- confirm against the complete file.
724 ;
------------------------------------------------------------------------------
726 ; unsigned long bn_div_words
(unsigned long h
, unsigned long
l, unsigned long d
)
732 ; This is mainly just modified assembly from the compiler
, thus the
733 ; lack of variable names.
735 ;
------------------------------------------------------------------------------
738 .callinfo CALLER,FRAME=272,ENTRY_GR=%r10,SAVE_RP,ARGS_SAVED,ORDERING_AWARE
739 .EXPORT bn_div_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
740 .IMPORT BN_num_bits_word,CODE,NO_RELOCATION
742 .IMPORT fprintf,CODE,NO_RELOCATION
743 .IMPORT abort,CODE,NO_RELOCATION
744 .IMPORT $$div2U,MILLICODE
; Keep the arguments in callee-save registers across the calls below.
756 STD %r27,-288(%r30) ; save gp
758 COPY
%r24,%r3 ; save d
759 COPY
%r26,%r4 ; save h
(high
64-bits
)
760 LDO
-1(%r0),%ret0 ; return
-1 by default
762 CMPB
,*= %r0,%arg2
,$D3 ; if
(d
== 0)
763 COPY
%r25,%r5 ; save
l (low
64-bits
)
765 LDO
-48(%r30),%r29 ; create ap
766 .CALL ;in=26,29;out=28;
767 B,L BN_num_bits_word
,%r2
769 LDD
-288(%r30),%r27 ; restore gp
772 CMPB
,= %r21,%ret0
,$
00000012 ;if
(i
== 64) (forward
)
775 DEPDI
,Z
-1,%sar
,1,%r29
776 CMPB
,*<<,N
%r29,%r4,bn_div_err_case ; if
(h
> 1<<i
) (forward
)
; Normalisation: i = 64 - i, reduce h below d, then shift d and h:l left by i.
779 SUBI 64,%r24,%r31 ; i
= 64 - i;
780 CMPCLR
,*<< %r4,%r3,%r0 ; if
(h
>= d
)
781 SUB %r4,%r3,%r4 ; h
-= d
782 CMPB
,= %r31,%r0,$
0000001A ; if
(i
)
783 COPY
%r0,%r10 ; ret
= 0
784 MTSARCM
%r31 ; i to shift
785 DEPD
,Z
%r3,%sar
,64,%r3 ; d
<<= i;
786 SUBI 64,%r31,%r19 ;
64 - i; redundent
787 MTSAR
%r19 ;
(64 -i
) to shift
788 SHRPD
%r4,%r5,%sar
,%r4 ;
l>> (64-i
)
789 MTSARCM
%r31 ; i to shift
790 DEPD
,Z
%r5,%sar
,64,%r5 ;
l <<= i;
; Split d into 32-bit halves: r6 = upper half (dh), r8 = lower half (dl).
; r19 is loaded with a mask of the upper 32 bits (used by the ANDs below).
793 DEPDI
,Z
-1,31,32,%r19
794 EXTRD
,U
%r3,31,32,%r6 ; dh
=(d
&0xfff)>>32
795 EXTRD
,U
%r3,63,32,%r8 ; dl
= d
&0xffffff
797 STD %r3,-280(%r30) ;
"d" to stack
800 DEPDI
,Z
-1,63,32,%r29 ;
801 EXTRD
,U
%r4,31,32,%r31 ; h
>> 32
802 CMPB
,*=,N
%r31,%r6,$D2 ; if
((h
>>32) != dh
)(forward
) div
804 EXTRD
,U
%r4,31,32,%r25
806 .CALL ;in=23,24,25,26;out=20,21,22,28,29; (MILLICALL)
808 EXTRD
,U
%r6,31,32,%r23
; q and d are pushed through the stack into FP registers so that q*d can be
; formed from 32x32 XMPYU partial products.
811 STD %r29,-272(%r30) ; q
812 AND %r5,%r19,%r24 ;
t & 0xffffffff00000000;
813 EXTRD
,U
%r24,31,32,%r24 ; ???
814 FLDD
-272(%r30),%fr7 ; q
815 FLDD
-280(%r30),%fr8 ; d
816 XMPYU
%fr8L
,%fr7L
,%fr10
817 FSTD
%fr10
,-256(%r30)
818 XMPYU
%fr8L
,%fr7R
,%fr22
819 FSTD
%fr22
,-264(%r30)
820 XMPYU
%fr8R
,%fr7L
,%fr11
821 XMPYU
%fr8R
,%fr7R
,%fr23
822 FSTD
%fr11
,-232(%r30)
823 FSTD
%fr23
,-240(%r30)
825 DEPD
,Z
%r28,31,32,%r2
829 DEPD
,Z
%r22,31,32,%r22
831 B $
00000024 ; enter loop
840 CMPB
,*<>,N
%r0,%r26,$
00000046 ;
(forward
)
841 DEPD
,Z
%r25,31,32,%r20
843 CMPB
,*<<,N
%r21,%r23,$
0000002A ;
(backward
)
845 ;
-------------Break path-
--------------------
; Fold the partial products into th:tl, correct q downward (q--, h += d) when
; the product exceeds the remainder, then shift the remainder by 32 bits.
848 DEPD
,Z
%r23,31,32,%r25 ;tl
849 EXTRD
,U
%r23,31,32,%r26 ;
t
850 AND %r25,%r19,%r24 ;tl
= (tl
<<32)&0xfffffff0000000L
851 ADD,L %r31,%r26,%r31 ;th
+= t;
852 CMPCLR
,*>>= %r5,%r24,%r0 ;if
(l<tl
)
853 LDO
1(%r31),%r31 ; th+
+;
854 CMPB
,*<<=,N
%r31,%r4,$
00000036 ;if
(n
< th
) (forward
)
855 LDO
-1(%r29),%r29 ;q-
-;
856 ADD,L %r4,%r3,%r4 ;h
+= d;
858 ADDIB
,=,N
-1,%r9,$D1 ;if
(--count
== 0) break
(forward
)
859 SUB %r5,%r24,%r28 ;
l -= tl;
860 SUB %r4,%r31,%r24 ; h
-= th;
861 SHRPD
%r24,%r28,32,%r4 ; h
= ((h
<<32)|
(l>>32));
862 DEPD
,Z
%r29,31,32,%r10 ; ret
= q
<<32
864 DEPD
,Z
%r28,31,32,%r5 ;
l = l << 32
867 OR %r10,%r29,%r28 ; ret |
= q
; LDD,MB moves %r30 (sp) back by 352 and restores r3.
880 LDD
,MB
-352(%r30),%r3
; Error path: r26 is loaded via the __iob linkage-table entry and r25 with the
; address of the C$4 literal.  NOTE(review): presumably the arguments of an
; fprintf call followed by abort (both imported above) -- the calls themselves
; are not visible in this listing.
884 ADDIL
L'bn_div_words-bn_div_err_case,%r6,%r1
885 LDO R'bn_div_words-bn_div_err_case
(%r1),%r6
886 ADDIL LT
'__iob,%r27,%r1
887 LDD RT'__iob
(%r1),%r26
888 ADDIL
L'C$4-bn_div_words,%r6,%r1
889 LDO R'C$
4-bn_div_words
(%r1),%r25
891 .CALL ;in=24,25,26,29;out=28;
901 .PROCEND ;in=24,25,26,29;out=28;
903 ;
----------------------------------------------------------------------------
905 ; Registers to hold 64-bit values to manipulate. The "L" part
906 ; of the register corresponds to the upper 32-bits, while the "R"
907 ; part corresponds to the lower 32-bits
909 ; Note, that when using b6 and b7, the code must save these before
910 ; using them because they are callee save registers
913 ; Floating point registers to use to save values that
914 ; are manipulated. These don't collide with ftemp1-6 and
915 ; are all caller save registers
958 ; Temporary floating point variables, these are all caller save
967 ; The B set of registers when used.
; Integer aliases used by the comba macros/routines below: c1 is one of the
; three accumulator words, temp1-temp3 are scratch registers inside the
; SQR_ADD_C / SQR_ADD_C2 / MUL_ADD_C macros.
; NOTE(review): the aliases for c2, c3, m, m1, lt, ht, high_one and the a*/b*
; FP names are used below but not defined in this listing.
994 c1 .reg %r21 ; only reg
995 temp1 .reg %r20 ; only reg
996 temp2 .reg %r19 ; only reg
997 temp3 .reg %r31 ; only reg
; SQR_ADD_C A0L,A0R,C1,C2,C3
;   Adds the 128-bit square of the 64-bit value held as halves A0L (upper 32
;   bits) / A0R (lower 32 bits) into the accumulator: C1 += low word,
;   C2 += high word, and C3 absorbs the carry out of C2.
;   Uses: ftemp1-ftemp3, m, lt, ht, temp1-temp3, stack slots -24/-16/-8(%sp).
;   Expects high_mask to hold 0xffffffff80000000 (set up by the callers).
; NOTE(review): the load of ht from -8(%sp) and the macro terminator are not
; visible in this listing -- confirm against the complete file.
1007 SQR_ADD_C .macro A0L,A0R,C1,C2,C3
1008 XMPYU A0L,A0R,ftemp1 ; m = ht*lt (cross product)
1009 FSTD ftemp1,-24(%sp) ; store m
1011 XMPYU A0R,A0R,ftemp2 ; lt = lt*lt
1012 FSTD ftemp2,-16(%sp) ; store lt
1014 XMPYU A0L,A0L,ftemp3 ; ht = ht*ht
1015 FSTD ftemp3,-8(%sp) ; store ht
1017 LDD -24(%sp),m ; load m
1018 AND m,high_mask,temp2 ; m & Mask (top 33 bits of m)
1019 DEPD,Z m,30,31,temp3 ; m << 32+1 (doubled cross term, low word)
1020 LDD -16(%sp),lt ; lt
1023 EXTRD,U temp2,32,33,temp1 ; temp1 = m&Mask >> 32-1 (doubled cross term, high word)
1024 ADD temp3,lt,lt ; lt = lt+m
1025 ADD,L ht,temp1,ht ; ht += temp1 (ADD,L leaves the carry bit alone)
1026 ADD,DC ht,%r0,ht ; ht++ (carry from lt + m)
1028 ADD C1,lt,C1 ; c1=c1+lt
1029 ADD,DC ht,%r0,ht ; ht++ (carry from c1 + lt)
1031 ADD C2,ht,C2 ; c2=c2+ht
1032 ADD,DC C3,%r0,C3 ; c3++ (carry from c2 + ht)
; SQR_ADD_C2 A0L,A0R,A1L,A1R,C1,C2,C3
;   Adds twice the 128-bit product of the two 64-bit values (A0L:A0R) and
;   (A1L:A1R) into the accumulator C1 (low), C2, C3.  The product ht:lt is
;   built from four 32x32 partial products, doubled word by word
;   (ht+ht, lt+lt), and every carry-out is propagated upward into ht or C3.
;   Uses: ftemp1-ftemp4, m, m1, lt, ht, temp1, temp3, slots -32..-8(%sp).
;   Expects high_one to hold 1 << 32 (set up by the callers).
; NOTE(review): the addition m = m + m1 that the CMPCLR below tests for
; wrap-around, and the macro terminator, are not visible in this listing --
; confirm against the complete file.
1035 SQR_ADD_C2 .macro A0L,A0R,A1L,A1R,C1,C2,C3
1036 XMPYU A0L,A1R,ftemp1 ; m1 = bl*ht
1037 FSTD ftemp1,-16(%sp) ;
1038 XMPYU A0R,A1L,ftemp2 ; m = bh*lt
1039 FSTD ftemp2,-8(%sp) ;
1040 XMPYU A0R,A1R,ftemp3 ; lt = bl*lt
1041 FSTD ftemp3,-32(%sp)
1042 XMPYU A0L,A1L,ftemp4 ; ht = bh*ht
1043 FSTD ftemp4,-24(%sp) ;
1045 LDD -8(%sp),m ; r21 = m
1046 LDD -16(%sp),m1 ; r19 = m1
1049 DEPD,Z m,31,32,temp3 ; (m+m1<<32)
1050 LDD -24(%sp),ht ; r24 = ht
1052 CMPCLR,*>>= m,m1,%r0 ; if (m < m1) -- otherwise the next ADD,L is nullified
1053 ADD,L ht,high_one,ht ; ht+=high_one
1055 EXTRD,U m,31,32,temp1 ; m >> 32
1056 LDD -32(%sp),lt ; lt
1057 ADD,L ht,temp1,ht ; ht+= m>>32
1058 ADD lt,temp3,lt ; lt = lt+m1
1059 ADD,DC ht,%r0,ht ; ht++
1061 ADD ht,ht,ht ; ht=ht+ht;
1062 ADD,DC C3,%r0,C3 ; add in carry (c3++)
1064 ADD lt,lt,lt ; lt=lt+lt;
1065 ADD,DC ht,%r0,ht ; add in carry (ht++)
1067 ADD C1,lt,C1 ; c1=c1+lt
1068 ADD,DC,*NUV ht,%r0,ht ; add in carry (ht++); nullifies the LDO unless this add overflows
1069 LDO 1(C3),C3 ; bump c3 if overflow,nullify otherwise
1071 ADD C2,ht,C2 ; c2 = c2 + ht
1072 ADD,DC C3,%r0,C3 ; add in carry (c3++)
; bn_sqr_comba8 -- squares the 8-word number a into the 16-word result r,
; column by column.  For output word r[k] every a[i]*a[j] with i + j = k is
; accumulated: SQR_ADD_C adds a diagonal term a[i]^2, SQR_ADD_C2 adds an
; off-diagonal term twice.  The accumulator (c1,c2,c3) rotates its roles after
; each stored word.
; NOTE(review): the loads of a0..a7 into FP registers, the initial clearing of
; c1/c2/c3, the re-zeroing of the freed accumulator word between columns and
; the return are not visible in this listing -- confirm against the full file.
1076 ;void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a)
1083 .CALLINFO FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
1084 .EXPORT bn_sqr_comba8,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
1088 STD %r3,0(%sp) ; save r3
1089 STD %r4,8(%sp) ; save r4
1090 STD %r5,16(%sp) ; save r5
1091 STD %r6,24(%sp) ; save r6
1100 LDO 128(%sp),%sp ; bump stack
1101 DEPDI,Z -1,32,33,high_mask ; Create Mask 0xffffffff80000000L
1102 DEPDI,Z 1,31,1,high_one ; Create Value 1 << 32
1105 ; Load up all of the values we are going to use
; column 0: a0^2
1116 SQR_ADD_C a0L,a0R,c1,c2,c3
1117 STD c1,0(r_ptr) ; r[0] = c1;
; column 1: 2*a1*a0
1120 SQR_ADD_C2 a1L,a1R,a0L,a0R,c2,c3,c1
1121 STD c2,8(r_ptr) ; r[1] = c2;
; column 2: a1^2 + 2*a2*a0
1124 SQR_ADD_C a1L,a1R,c3,c1,c2
1125 SQR_ADD_C2 a2L,a2R,a0L,a0R,c3,c1,c2
1126 STD c3,16(r_ptr) ; r[2] = c3;
; column 3
1129 SQR_ADD_C2 a3L,a3R,a0L,a0R,c1,c2,c3
1130 SQR_ADD_C2 a2L,a2R,a1L,a1R,c1,c2,c3
1131 STD c1,24(r_ptr) ; r[3] = c1;
; column 4
1134 SQR_ADD_C a2L,a2R,c2,c3,c1
1135 SQR_ADD_C2 a3L,a3R,a1L,a1R,c2,c3,c1
1136 SQR_ADD_C2 a4L,a4R,a0L,a0R,c2,c3,c1
1137 STD c2,32(r_ptr) ; r[4] = c2;
; column 5
1140 SQR_ADD_C2 a5L,a5R,a0L,a0R,c3,c1,c2
1141 SQR_ADD_C2 a4L,a4R,a1L,a1R,c3,c1,c2
1142 SQR_ADD_C2 a3L,a3R,a2L,a2R,c3,c1,c2
1143 STD c3,40(r_ptr) ; r[5] = c3;
; column 6
1146 SQR_ADD_C a3L,a3R,c1,c2,c3
1147 SQR_ADD_C2 a4L,a4R,a2L,a2R,c1,c2,c3
1148 SQR_ADD_C2 a5L,a5R,a1L,a1R,c1,c2,c3
1149 SQR_ADD_C2 a6L,a6R,a0L,a0R,c1,c2,c3
1150 STD c1,48(r_ptr) ; r[6] = c1;
; column 7
1153 SQR_ADD_C2 a7L,a7R,a0L,a0R,c2,c3,c1
1154 SQR_ADD_C2 a6L,a6R,a1L,a1R,c2,c3,c1
1155 SQR_ADD_C2 a5L,a5R,a2L,a2R,c2,c3,c1
1156 SQR_ADD_C2 a4L,a4R,a3L,a3R,c2,c3,c1
1157 STD c2,56(r_ptr) ; r[7] = c2;
; column 8
1160 SQR_ADD_C a4L,a4R,c3,c1,c2
1161 SQR_ADD_C2 a5L,a5R,a3L,a3R,c3,c1,c2
1162 SQR_ADD_C2 a6L,a6R,a2L,a2R,c3,c1,c2
1163 SQR_ADD_C2 a7L,a7R,a1L,a1R,c3,c1,c2
1164 STD c3,64(r_ptr) ; r[8] = c3;
; column 9
1167 SQR_ADD_C2 a7L,a7R,a2L,a2R,c1,c2,c3
1168 SQR_ADD_C2 a6L,a6R,a3L,a3R,c1,c2,c3
1169 SQR_ADD_C2 a5L,a5R,a4L,a4R,c1,c2,c3
1170 STD c1,72(r_ptr) ; r[9] = c1;
; column 10
1173 SQR_ADD_C a5L,a5R,c2,c3,c1
1174 SQR_ADD_C2 a6L,a6R,a4L,a4R,c2,c3,c1
1175 SQR_ADD_C2 a7L,a7R,a3L,a3R,c2,c3,c1
1176 STD c2,80(r_ptr) ; r[10] = c2;
; column 11
1179 SQR_ADD_C2 a7L,a7R,a4L,a4R,c3,c1,c2
1180 SQR_ADD_C2 a6L,a6R,a5L,a5R,c3,c1,c2
1181 STD c3,88(r_ptr) ; r[11] = c3;
; column 12
1184 SQR_ADD_C a6L,a6R,c1,c2,c3
1185 SQR_ADD_C2 a7L,a7R,a5L,a5R,c1,c2,c3
1186 STD c1,96(r_ptr) ; r[12] = c1;
; column 13
1189 SQR_ADD_C2 a7L,a7R,a6L,a6R,c2,c3,c1
1190 STD c2,104(r_ptr) ; r[13] = c2;
; columns 14 and 15: a7^2 supplies both remaining words
1193 SQR_ADD_C a7L,a7R,c3,c1,c2
1194 STD c3, 112(r_ptr) ; r[14] = c3
1195 STD c1, 120(r_ptr) ; r[15] = c1
; Epilogue: restore r6..r4 relative to the bumped %sp.
1198 LDD -104(%sp),%r6 ; restore r6
1199 LDD -112(%sp),%r5 ; restore r5
1200 LDD -120(%sp),%r4 ; restore r4
1202 LDD,MB -128(%sp),%r3 ; move %sp back by 128, then restore r3
; bn_sqr_comba4 -- squares the 4-word number a into the 8-word result r,
; column by column: SQR_ADD_C adds a diagonal term a[i]^2 and SQR_ADD_C2 adds
; an off-diagonal term a[i]*a[j] twice, with the accumulator (c1,c2,c3)
; rotating after each stored word.
; NOTE(review): the loads of a0..a3, the initial clearing of c1/c2/c3, the
; re-zeroing of the freed accumulator word between columns and the return are
; not visible in this listing -- confirm against the complete file.
1206 ;-----------------------------------------------------------------------------
1208 ;void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a)
1215 .callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
1216 .EXPORT bn_sqr_comba4,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
1219 STD %r3,0(%sp) ; save r3
1220 STD %r4,8(%sp) ; save r4
1221 STD %r5,16(%sp) ; save r5
1222 STD %r6,24(%sp) ; save r6
1231 LDO 128(%sp),%sp ; bump stack
1232 DEPDI,Z -1,32,33,high_mask ; Create Mask 0xffffffff80000000L
1233 DEPDI,Z 1,31,1,high_one ; Create Value 1 << 32
1236 ; Load up all of the values we are going to use
; column 0: a0^2
1247 SQR_ADD_C a0L,a0R,c1,c2,c3
1249 STD c1,0(r_ptr) ; r[0] = c1;
; column 1: 2*a1*a0
1252 SQR_ADD_C2 a1L,a1R,a0L,a0R,c2,c3,c1
1254 STD c2,8(r_ptr) ; r[1] = c2;
; column 2: a1^2 + 2*a2*a0
1257 SQR_ADD_C a1L,a1R,c3,c1,c2
1258 SQR_ADD_C2 a2L,a2R,a0L,a0R,c3,c1,c2
1260 STD c3,16(r_ptr) ; r[2] = c3;
; column 3: 2*a3*a0 + 2*a2*a1
1263 SQR_ADD_C2 a3L,a3R,a0L,a0R,c1,c2,c3
1264 SQR_ADD_C2 a2L,a2R,a1L,a1R,c1,c2,c3
1266 STD c1,24(r_ptr) ; r[3] = c1;
; column 4: a2^2 + 2*a3*a1
1269 SQR_ADD_C a2L,a2R,c2,c3,c1
1270 SQR_ADD_C2 a3L,a3R,a1L,a1R,c2,c3,c1
1272 STD c2,32(r_ptr) ; r[4] = c2;
; column 5: 2*a3*a2
1275 SQR_ADD_C2 a3L,a3R,a2L,a2R,c3,c1,c2
1276 STD c3,40(r_ptr) ; r[5] = c3;
; columns 6 and 7: a3^2 supplies both remaining words
1279 SQR_ADD_C a3L,a3R,c1,c2,c3
1280 STD c1,48(r_ptr) ; r[6] = c1;
1281 STD c2,56(r_ptr) ; r[7] = c2;
; Epilogue: restore r6..r4 relative to the bumped %sp.
1284 LDD -104(%sp),%r6 ; restore r6
1285 LDD -112(%sp),%r5 ; restore r5
1286 LDD -120(%sp),%r4 ; restore r4
1288 LDD,MB -128(%sp),%r3 ; move %sp back by 128, then restore r3
; MUL_ADD_C A0L,A0R,B0L,B0R,C1,C2,C3
;   Adds the 128-bit product of the two 64-bit values (A0L:A0R) and (B0L:B0R)
;   into the accumulator: C1 += low word, C2 += high word, C3 absorbs the
;   carry out of C2.  The product is built from four 32x32 XMPYU partial
;   products passed through stack slots -32..-8(%sp).
;   Uses: ftemp1-ftemp4, m, m1, lt, ht, temp1, temp3.
;   Expects high_one to hold 1 << 32 (set up by the callers).
; NOTE(review): the addition m = m + m1 that the CMPCLR below tests for
; wrap-around, and the macro terminator, are not visible in this listing --
; confirm against the complete file.
1293 ;---------------------------------------------------------------------------
1295 MUL_ADD_C .macro A0L,A0R,B0L,B0R,C1,C2,C3
1296 XMPYU A0L,B0R,ftemp1 ; m1 = bl*ht
1297 FSTD ftemp1,-16(%sp) ;
1298 XMPYU A0R,B0L,ftemp2 ; m = bh*lt
1299 FSTD ftemp2,-8(%sp) ;
1300 XMPYU A0R,B0R,ftemp3 ; lt = bl*lt
1301 FSTD ftemp3,-32(%sp)
1302 XMPYU A0L,B0L,ftemp4 ; ht = bh*ht
1303 FSTD ftemp4,-24(%sp) ;
1305 LDD -8(%sp),m ; r21 = m
1306 LDD -16(%sp),m1 ; r19 = m1
1309 DEPD,Z m,31,32,temp3 ; (m+m1<<32)
1310 LDD -24(%sp),ht ; r24 = ht
1312 CMPCLR,*>>= m,m1,%r0 ; if (m < m1) -- otherwise the next ADD,L is nullified
1313 ADD,L ht,high_one,ht ; ht+=high_one
1315 EXTRD,U m,31,32,temp1 ; m >> 32
1316 LDD -32(%sp),lt ; lt
1317 ADD,L ht,temp1,ht ; ht+= m>>32
1318 ADD lt,temp3,lt ; lt = lt+m1
1319 ADD,DC ht,%r0,ht ; ht++
1321 ADD C1,lt,C1 ; c1=c1+lt
1322 ADD,DC ht,%r0,ht ; add in carry (ht++)
1324 ADD C2,ht,C2 ; c2 = c2 + ht
1325 ADD,DC C3,%r0,C3 ; add in carry (c3++)
; bn_mul_comba8 -- multiplies the 8-word numbers a and b into the 16-word
; result r, column by column.  Each blank-comment-separated group below sums
; every a[i]*b[j] with i + j = k via MUL_ADD_C, and the accumulator
; (c1,c2,c3) rotates its roles from one column to the next.
; fr12/fr13 are callee-save FP registers, hence the two FSTD spills.
; NOTE(review): the loads of a0..a7 / b0..b7, the STD of each finished column
; into r, the re-zeroing of the freed accumulator word, the reloads of
; fr12/fr13 and the return are not visible in this listing -- confirm against
; the complete file.
1330 ;void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
1338 .callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
1339 .EXPORT bn_mul_comba8,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
1343 STD %r3,0(%sp) ; save r3
1344 STD %r4,8(%sp) ; save r4
1345 STD %r5,16(%sp) ; save r5
1346 STD %r6,24(%sp) ; save r6
1347 FSTD %fr12,32(%sp) ; save fr12
1348 FSTD %fr13,40(%sp) ; save fr13
1357 LDO 128(%sp),%sp ; bump stack
1358 DEPDI,Z 1,31,1,high_one ; Create Value 1 << 32
1361 ; Load up all of the values we are going to use
; column 0
1381 MUL_ADD_C a0L,a0R,b0L,b0R,c1,c2,c3
; column 1
1385 MUL_ADD_C a0L,a0R,b1L,b1R,c2,c3,c1
1386 MUL_ADD_C a1L,a1R,b0L,b0R,c2,c3,c1
; column 2
1390 MUL_ADD_C a2L,a2R,b0L,b0R,c3,c1,c2
1391 MUL_ADD_C a1L,a1R,b1L,b1R,c3,c1,c2
1392 MUL_ADD_C a0L,a0R,b2L,b2R,c3,c1,c2
; column 3
1396 MUL_ADD_C a0L,a0R,b3L,b3R,c1,c2,c3
1397 MUL_ADD_C a1L,a1R,b2L,b2R,c1,c2,c3
1398 MUL_ADD_C a2L,a2R,b1L,b1R,c1,c2,c3
1399 MUL_ADD_C a3L,a3R,b0L,b0R,c1,c2,c3
; column 4
1403 MUL_ADD_C a4L,a4R,b0L,b0R,c2,c3,c1
1404 MUL_ADD_C a3L,a3R,b1L,b1R,c2,c3,c1
1405 MUL_ADD_C a2L,a2R,b2L,b2R,c2,c3,c1
1406 MUL_ADD_C a1L,a1R,b3L,b3R,c2,c3,c1
1407 MUL_ADD_C a0L,a0R,b4L,b4R,c2,c3,c1
; column 5
1411 MUL_ADD_C a0L,a0R,b5L,b5R,c3,c1,c2
1412 MUL_ADD_C a1L,a1R,b4L,b4R,c3,c1,c2
1413 MUL_ADD_C a2L,a2R,b3L,b3R,c3,c1,c2
1414 MUL_ADD_C a3L,a3R,b2L,b2R,c3,c1,c2
1415 MUL_ADD_C a4L,a4R,b1L,b1R,c3,c1,c2
1416 MUL_ADD_C a5L,a5R,b0L,b0R,c3,c1,c2
; column 6
1420 MUL_ADD_C a6L,a6R,b0L,b0R,c1,c2,c3
1421 MUL_ADD_C a5L,a5R,b1L,b1R,c1,c2,c3
1422 MUL_ADD_C a4L,a4R,b2L,b2R,c1,c2,c3
1423 MUL_ADD_C a3L,a3R,b3L,b3R,c1,c2,c3
1424 MUL_ADD_C a2L,a2R,b4L,b4R,c1,c2,c3
1425 MUL_ADD_C a1L,a1R,b5L,b5R,c1,c2,c3
1426 MUL_ADD_C a0L,a0R,b6L,b6R,c1,c2,c3
; column 7
1430 MUL_ADD_C a0L,a0R,b7L,b7R,c2,c3,c1
1431 MUL_ADD_C a1L,a1R,b6L,b6R,c2,c3,c1
1432 MUL_ADD_C a2L,a2R,b5L,b5R,c2,c3,c1
1433 MUL_ADD_C a3L,a3R,b4L,b4R,c2,c3,c1
1434 MUL_ADD_C a4L,a4R,b3L,b3R,c2,c3,c1
1435 MUL_ADD_C a5L,a5R,b2L,b2R,c2,c3,c1
1436 MUL_ADD_C a6L,a6R,b1L,b1R,c2,c3,c1
1437 MUL_ADD_C a7L,a7R,b0L,b0R,c2,c3,c1
; column 8
1441 MUL_ADD_C a7L,a7R,b1L,b1R,c3,c1,c2
1442 MUL_ADD_C a6L,a6R,b2L,b2R,c3,c1,c2
1443 MUL_ADD_C a5L,a5R,b3L,b3R,c3,c1,c2
1444 MUL_ADD_C a4L,a4R,b4L,b4R,c3,c1,c2
1445 MUL_ADD_C a3L,a3R,b5L,b5R,c3,c1,c2
1446 MUL_ADD_C a2L,a2R,b6L,b6R,c3,c1,c2
1447 MUL_ADD_C a1L,a1R,b7L,b7R,c3,c1,c2
; column 9
1451 MUL_ADD_C a2L,a2R,b7L,b7R,c1,c2,c3
1452 MUL_ADD_C a3L,a3R,b6L,b6R,c1,c2,c3
1453 MUL_ADD_C a4L,a4R,b5L,b5R,c1,c2,c3
1454 MUL_ADD_C a5L,a5R,b4L,b4R,c1,c2,c3
1455 MUL_ADD_C a6L,a6R,b3L,b3R,c1,c2,c3
1456 MUL_ADD_C a7L,a7R,b2L,b2R,c1,c2,c3
; column 10
1460 MUL_ADD_C a7L,a7R,b3L,b3R,c2,c3,c1
1461 MUL_ADD_C a6L,a6R,b4L,b4R,c2,c3,c1
1462 MUL_ADD_C a5L,a5R,b5L,b5R,c2,c3,c1
1463 MUL_ADD_C a4L,a4R,b6L,b6R,c2,c3,c1
1464 MUL_ADD_C a3L,a3R,b7L,b7R,c2,c3,c1
; column 11
1468 MUL_ADD_C a4L,a4R,b7L,b7R,c3,c1,c2
1469 MUL_ADD_C a5L,a5R,b6L,b6R,c3,c1,c2
1470 MUL_ADD_C a6L,a6R,b5L,b5R,c3,c1,c2
1471 MUL_ADD_C a7L,a7R,b4L,b4R,c3,c1,c2
; column 12
1475 MUL_ADD_C a7L,a7R,b5L,b5R,c1,c2,c3
1476 MUL_ADD_C a6L,a6R,b6L,b6R,c1,c2,c3
1477 MUL_ADD_C a5L,a5R,b7L,b7R,c1,c2,c3
; column 13
1481 MUL_ADD_C a6L,a6R,b7L,b7R,c2,c3,c1
1482 MUL_ADD_C a7L,a7R,b6L,b6R,c2,c3,c1
; column 14 (its carry word supplies column 15)
1486 MUL_ADD_C a7L,a7R,b7L,b7R,c3,c1,c2
; Epilogue: restore r6..r4 relative to the bumped %sp.
1493 LDD -104(%sp),%r6 ; restore r6
1494 LDD -112(%sp),%r5 ; restore r5
1495 LDD -120(%sp),%r4 ; restore r4
1497 LDD,MB -128(%sp),%r3 ; move %sp back by 128, then restore r3
; bn_mul_comba4 -- multiplies the 4-word numbers a and b into the 8-word
; result r, column by column: each group below sums every a[i]*b[j] with
; i + j = k via MUL_ADD_C, with the accumulator (c1,c2,c3) rotating its roles
; from one column to the next.
; fr12/fr13 are callee-save FP registers, hence the two FSTD spills.
; NOTE(review): the loads of a0..a3 / b0..b3, the STD of each finished column
; into r, the re-zeroing of the freed accumulator word, the reloads of
; fr12/fr13 and the return are not visible in this listing -- confirm against
; the complete file.
1501 ;-----------------------------------------------------------------------------
1503 ;void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
1511 .callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
1512 .EXPORT bn_mul_comba4,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
1516 STD %r3,0(%sp) ; save r3
1517 STD %r4,8(%sp) ; save r4
1518 STD %r5,16(%sp) ; save r5
1519 STD %r6,24(%sp) ; save r6
1520 FSTD %fr12,32(%sp) ; save fr12
1521 FSTD %fr13,40(%sp) ; save fr13
1530 LDO 128(%sp),%sp ; bump stack
1531 DEPDI,Z 1,31,1,high_one ; Create Value 1 << 32
1534 ; Load up all of the values we are going to use
; column 0
1546 MUL_ADD_C a0L,a0R,b0L,b0R,c1,c2,c3
; column 1
1550 MUL_ADD_C a0L,a0R,b1L,b1R,c2,c3,c1
1551 MUL_ADD_C a1L,a1R,b0L,b0R,c2,c3,c1
; column 2
1555 MUL_ADD_C a2L,a2R,b0L,b0R,c3,c1,c2
1556 MUL_ADD_C a1L,a1R,b1L,b1R,c3,c1,c2
1557 MUL_ADD_C a0L,a0R,b2L,b2R,c3,c1,c2
; column 3
1561 MUL_ADD_C a0L,a0R,b3L,b3R,c1,c2,c3
1562 MUL_ADD_C a1L,a1R,b2L,b2R,c1,c2,c3
1563 MUL_ADD_C a2L,a2R,b1L,b1R,c1,c2,c3
1564 MUL_ADD_C a3L,a3R,b0L,b0R,c1,c2,c3
; column 4
1568 MUL_ADD_C a3L,a3R,b1L,b1R,c2,c3,c1
1569 MUL_ADD_C a2L,a2R,b2L,b2R,c2,c3,c1
1570 MUL_ADD_C a1L,a1R,b3L,b3R,c2,c3,c1
; column 5
1574 MUL_ADD_C a2L,a2R,b3L,b3R,c3,c1,c2
1575 MUL_ADD_C a3L,a3R,b2L,b2R,c3,c1,c2
; column 6 (its carry word supplies column 7)
1579 MUL_ADD_C a3L,a3R,b3L,b3R,c1,c2,c3
; Epilogue: restore r6..r4 relative to the bumped %sp.
1586 LDD -104(%sp),%r6 ; restore r6
1587 LDD -112(%sp),%r5 ; restore r5
1588 LDD -120(%sp),%r4 ; restore r4
1590 LDD,MB -128(%sp),%r3 ; move %sp back by 128, then restore r3
; Data and literal subspaces.
; NOTE(review): the string below is presumably the C$4 literal that
; bn_div_words' error path addresses as a format string -- the label line is
; not visible in this listing; confirm against the complete file.
1597 .SPACE $PRIVATE$,SORT=16
1598 .IMPORT $global$,DATA
1601 .SUBSPA $LIT$,ACCESS=0x2c
1604 .STRINGZ "Division would overflow (%d)\n"