3 .asciiz "mips3.s, Version 1.2"
4 .asciiz "MIPS II/III/IV ISA artwork by Andy Polyakov <appro@fy.chalmers.se>"
10 .globl bn_mul_add_words
14 bgtz $
6,bn_mul_add_words_internal
21 .ent bn_mul_add_words_internal
22 bn_mul_add_words_internal
:
27 beqz $
8,.L_bn_mul_add_words_tail
29 .L_bn_mul_add_words_loop:
37 sltu $
2,$
13,$
2 # All manuals say it "compares 32-bit
38 # values", but it seems to work fine
39 # even on 64-bit registers.
88 bgtzl $
8,.L_bn_mul_add_words_loop
91 beqz $
6,.L_bn_mul_add_words_return
94 .L_bn_mul_add_words_tail:
109 beqz $
6,.L_bn_mul_add_words_return
124 beqz $
6,.L_bn_mul_add_words_return
139 .L_bn_mul_add_words_return:
143 .end bn_mul_add_words_internal
150 bgtz $
6,bn_mul_words_internal
157 .ent bn_mul_words_internal
158 bn_mul_words_internal
:
163 beqz $
8,.L_bn_mul_words_tail
165 .L_bn_mul_words_loop:
205 bgtzl $
8,.L_bn_mul_words_loop
208 beqz $
6,.L_bn_mul_words_return
211 .L_bn_mul_words_tail:
222 beqz $
6,.L_bn_mul_words_return
233 beqz $
6,.L_bn_mul_words_return
244 .L_bn_mul_words_return:
248 .end bn_mul_words_internal
255 bgtz $
6,bn_sqr_words_internal
262 .ent bn_sqr_words_internal
263 bn_sqr_words_internal
:
268 beqz $
8,.L_bn_sqr_words_tail
270 .L_bn_sqr_words_loop:
304 bgtzl $
8,.L_bn_sqr_words_loop
307 beqz $
6,.L_bn_sqr_words_return
310 .L_bn_sqr_words_tail:
319 beqz $
6,.L_bn_sqr_words_return
328 beqz $
6,.L_bn_sqr_words_return
337 .L_bn_sqr_words_return:
342 .end bn_sqr_words_internal
349 bgtz $
7,bn_add_words_internal
356 .ent bn_add_words_internal
357 bn_add_words_internal
:
362 beqz $
1,.L_bn_add_words_tail
364 .L_bn_add_words_loop:
406 bgtzl $
1,.L_bn_add_words_loop
409 beqz $
7,.L_bn_add_words_return
412 .L_bn_add_words_tail:
423 beqz $
7,.L_bn_add_words_return
434 beqz $
7,.L_bn_add_words_return
445 .L_bn_add_words_return:
450 .end bn_add_words_internal
457 bgtz $
7,bn_sub_words_internal
464 .ent bn_sub_words_internal
465 bn_sub_words_internal
:
470 beqz $
1,.L_bn_sub_words_tail
472 .L_bn_sub_words_loop:
515 bgtzl $
1,.L_bn_sub_words_loop
518 beqz $
7,.L_bn_sub_words_return
521 .L_bn_sub_words_tail:
532 beqz $
7,.L_bn_sub_words_return
543 beqz $
7,.L_bn_sub_words_return
554 .L_bn_sub_words_return:
558 .end bn_sub_words_internal
561 .globl bn_div_3_words
565 move $
7,$
4 # we know that bn_div_words does not
566 # touch $7, $10, $11 and preserves $6
567 # so that we can save two arguments
568 # and return address in registers
569 # instead of stack:-)
573 bne $
4,$
6,bn_div_3_words_internal
581 .ent bn_div_3_words_internal
582 bn_div_3_words_internal
:
593 .L_bn_div_3_words_inner_loop:
594 bnez $
24,.L_bn_div_3_words_inner_loop_done
606 beqzl $
1,.L_bn_div_3_words_inner_loop
609 .L_bn_div_3_words_inner_loop_done:
613 .end bn_div_3_words_internal
620 bnez $
6,bn_div_words_internal
621 li $
2,-1 # I would rather signal div-by-zero
622 # which can be done with 'break 7'
628 .ent bn_div_words_internal
629 bn_div_words_internal
:
631 bltz $
6,.L_bn_div_words_body
645 break
6 # signal overflow
650 .L_bn_div_words_body:
660 srl $
8,4*4 # q=0xffffffff
661 beq $
3,$
9,.L_bn_div_words_skip_div1
664 .L_bn_div_words_skip_div1:
666 sll $
15,$
4,4*4 # bits
671 .L_bn_div_words_inner_loop1:
679 beqz $
1,.L_bn_div_words_inner_loop1_done
682 b .L_bn_div_words_inner_loop1
685 .L_bn_div_words_inner_loop1_done:
693 srl $
8,4*4 # q=0xffffffff
694 beq $
3,$
9,.L_bn_div_words_skip_div2
697 .L_bn_div_words_skip_div2:
699 sll $
15,$
4,4*4 # bits
704 .L_bn_div_words_inner_loop2:
712 beqz $
1,.L_bn_div_words_inner_loop2_done
715 b .L_bn_div_words_inner_loop2
718 .L_bn_div_words_inner_loop2_done:
722 srl $
3,$
4,$
25 # $3 contains remainder if anybody wants it
723 srl $
6,$
25 # restore $6
729 .end bn_div_words_internal
747 lw $
12,0($
5) # If compiled with -mips3 option on
748 # R5000 box assembler barks on this
749 # line with "should not have mult/div
750 # as last instruction in bb (R10K
751 # bug)" warning. If anybody out there
752 # has a clue about how to circumvent
753 # this do send me a note.
754 # <appro@fy.chalmers.se>
759 multu $
12,$
8 # mul_add_c(a[0],b[0],c1,c2,c3);
769 multu $
12,$
9 # mul_add_c(a[0],b[1],c2,c3,c1);
778 multu $
13,$
8 # mul_add_c(a[1],b[0],c2,c3,c1);
782 sw $
2,0($
4) # r[0]=c1;
787 multu $
14,$
8 # mul_add_c(a[2],b[0],c3,c1,c2);
791 sw $
3,4($
4) # r[1]=c2;
797 multu $
13,$
9 # mul_add_c(a[1],b[1],c3,c1,c2);
804 multu $
12,$
10 # mul_add_c(a[0],b[2],c3,c1,c2);
812 multu $
12,$
11 # mul_add_c(a[0],b[3],c1,c2,c3);
817 sw $
7,2*4($
4) # r[2]=c3;
823 multu $
13,$
10 # mul_add_c(a[1],b[2],c1,c2,c3);
831 multu $
14,$
9 # mul_add_c(a[2],b[1],c1,c2,c3);
840 multu $
15,$
8 # mul_add_c(a[3],b[0],c1,c2,c3);
849 multu $
16,$
8 # mul_add_c(a[4],b[0],c2,c3,c1);
854 sw $
2,3*4($
4) # r[3]=c1;
860 multu $
15,$
9 # mul_add_c(a[3],b[1],c2,c3,c1);
868 multu $
14,$
10 # mul_add_c(a[2],b[2],c2,c3,c1);
877 multu $
13,$
11 # mul_add_c(a[1],b[3],c2,c3,c1);
886 multu $
12,$
17 # mul_add_c(a[0],b[4],c2,c3,c1);
895 multu $
12,$
19 # mul_add_c(a[0],b[5],c3,c1,c2);
900 sw $
3,4*4($
4) # r[4]=c2;
906 multu $
13,$
17 # mul_add_c(a[1],b[4],c3,c1,c2);
914 multu $
14,$
11 # mul_add_c(a[2],b[3],c3,c1,c2);
923 multu $
15,$
10 # mul_add_c(a[3],b[2],c3,c1,c2);
932 multu $
16,$
9 # mul_add_c(a[4],b[1],c3,c1,c2);
941 multu $
18,$
8 # mul_add_c(a[5],b[0],c3,c1,c2);
950 multu $
20,$
8 # mul_add_c(a[6],b[0],c1,c2,c3);
955 sw $
7,5*4($
4) # r[5]=c3;
961 multu $
18,$
9 # mul_add_c(a[5],b[1],c1,c2,c3);
969 multu $
16,$
10 # mul_add_c(a[4],b[2],c1,c2,c3);
978 multu $
15,$
11 # mul_add_c(a[3],b[3],c1,c2,c3);
987 multu $
14,$
17 # mul_add_c(a[2],b[4],c1,c2,c3);
996 multu $
13,$
19 # mul_add_c(a[1],b[5],c1,c2,c3);
1005 multu $
12,$
21 # mul_add_c(a[0],b[6],c1,c2,c3);
1014 multu $
12,$
6 # mul_add_c(a[0],b[7],c2,c3,c1);
1019 sw $
2,6*4($
4) # r[6]=c1;
1025 multu $
13,$
21 # mul_add_c(a[1],b[6],c2,c3,c1);
1033 multu $
14,$
19 # mul_add_c(a[2],b[5],c2,c3,c1);
1042 multu $
15,$
17 # mul_add_c(a[3],b[4],c2,c3,c1);
1051 multu $
16,$
11 # mul_add_c(a[4],b[3],c2,c3,c1);
1060 multu $
18,$
10 # mul_add_c(a[5],b[2],c2,c3,c1);
1069 multu $
20,$
9 # mul_add_c(a[6],b[1],c2,c3,c1);
1078 multu $
5,$
8 # mul_add_c(a[7],b[0],c2,c3,c1);
1087 multu $
5,$
9 # mul_add_c(a[7],b[1],c3,c1,c2);
1092 sw $
3,7*4($
4) # r[7]=c2;
1098 multu $
20,$
10 # mul_add_c(a[6],b[2],c3,c1,c2);
1106 multu $
18,$
11 # mul_add_c(a[5],b[3],c3,c1,c2);
1115 multu $
16,$
17 # mul_add_c(a[4],b[4],c3,c1,c2);
1124 multu $
15,$
19 # mul_add_c(a[3],b[5],c3,c1,c2);
1133 multu $
14,$
21 # mul_add_c(a[2],b[6],c3,c1,c2);
1142 multu $
13,$
6 # mul_add_c(a[1],b[7],c3,c1,c2);
1151 multu $
14,$
6 # mul_add_c(a[2],b[7],c1,c2,c3);
1156 sw $
7,8*4($
4) # r[8]=c3;
1162 multu $
15,$
21 # mul_add_c(a[3],b[6],c1,c2,c3);
1170 multu $
16,$
19 # mul_add_c(a[4],b[5],c1,c2,c3);
1179 multu $
18,$
17 # mul_add_c(a[5],b[4],c1,c2,c3);
1188 multu $
20,$
11 # mul_add_c(a[6],b[3],c1,c2,c3);
1197 multu $
5,$
10 # mul_add_c(a[7],b[2],c1,c2,c3);
1206 multu $
5,$
11 # mul_add_c(a[7],b[3],c2,c3,c1);
1211 sw $
2,9*4($
4) # r[9]=c1;
1217 multu $
20,$
17 # mul_add_c(a[6],b[4],c2,c3,c1);
1225 multu $
18,$
19 # mul_add_c(a[5],b[5],c2,c3,c1);
1234 multu $
16,$
21 # mul_add_c(a[4],b[6],c2,c3,c1);
1243 multu $
15,$
6 # mul_add_c(a[3],b[7],c2,c3,c1);
1252 multu $
16,$
6 # mul_add_c(a[4],b[7],c3,c1,c2);
1257 sw $
3,10*4($
4) # r[10]=c2;
1263 multu $
18,$
21 # mul_add_c(a[5],b[6],c3,c1,c2);
1271 multu $
20,$
19 # mul_add_c(a[6],b[5],c3,c1,c2);
1280 multu $
5,$
17 # mul_add_c(a[7],b[4],c3,c1,c2);
1289 multu $
5,$
19 # mul_add_c(a[7],b[5],c1,c2,c3);
1294 sw $
7,11*4($
4) # r[11]=c3;
1300 multu $
20,$
21 # mul_add_c(a[6],b[6],c1,c2,c3);
1308 multu $
18,$
6 # mul_add_c(a[5],b[7],c1,c2,c3);
1317 multu $
20,$
6 # mul_add_c(a[6],b[7],c2,c3,c1);
1322 sw $
2,12*4($
4) # r[12]=c1;
1328 multu $
5,$
21 # mul_add_c(a[7],b[6],c2,c3,c1);
1336 multu $
5,$
6 # mul_add_c(a[7],b[7],c3,c1,c2);
1341 sw $
3,13*4($
4) # r[13]=c2;
1349 sw $
7,14*4($
4) # r[14]=c3;
1350 sw $
2,15*4($
4) # r[15]=c1;
1364 .globl bn_mul_comba4
1372 multu $
12,$
8 # mul_add_c(a[0],b[0],c1,c2,c3);
1381 multu $
12,$
9 # mul_add_c(a[0],b[1],c2,c3,c1);
1386 multu $
13,$
8 # mul_add_c(a[1],b[0],c2,c3,c1);
1392 multu $
14,$
8 # mul_add_c(a[2],b[0],c3,c1,c2);
1402 multu $
13,$
9 # mul_add_c(a[1],b[1],c3,c1,c2);
1409 multu $
12,$
10 # mul_add_c(a[0],b[2],c3,c1,c2);
1417 multu $
12,$
11 # mul_add_c(a[0],b[3],c1,c2,c3);
1428 multu $
13,$
10 # mul_add_c(a[1],b[2],c1,c2,c3);
1436 multu $
14,$
9 # mul_add_c(a[2],b[1],c1,c2,c3);
1445 multu $
15,$
8 # mul_add_c(a[3],b[0],c1,c2,c3);
1454 multu $
15,$
9 # mul_add_c(a[3],b[1],c2,c3,c1);
1465 multu $
14,$
10 # mul_add_c(a[2],b[2],c2,c3,c1);
1473 multu $
13,$
11 # mul_add_c(a[1],b[3],c2,c3,c1);
1482 multu $
14,$
11 # mul_add_c(a[2],b[3],c3,c1,c2);
1493 multu $
15,$
10 # mul_add_c(a[3],b[2],c3,c1,c2);
1501 multu $
15,$
11 # mul_add_c(a[3],b[3],c1,c2,c3);
1523 .globl bn_sqr_comba8
1532 multu $
12,$
12 # mul_add_c(a[0],b[0],c1,c2,c3);
1541 multu $
12,$
13 # mul_add_c2(a[0],b[1],c2,c3,c1);
1546 multu $
14,$
12 # mul_add_c2(a[2],b[0],c3,c1,c2);
1559 multu $
13,$
13 # mul_add_c(a[1],b[1],c3,c1,c2);
1573 multu $
12,$
15 # mul_add_c2(a[0],b[3],c1,c2,c3);
1584 multu $
13,$
14 # mul_add_c2(a[1],b[2],c1,c2,c3);
1598 multu $
8,$
12 # mul_add_c2(a[4],b[0],c2,c3,c1);
1615 multu $
15,$
13 # mul_add_c2(a[3],b[1],c2,c3,c1);
1629 multu $
14,$
14 # mul_add_c(a[2],b[2],c2,c3,c1);
1644 multu $
12,$
9 # mul_add_c2(a[0],b[5],c3,c1,c2);
1655 multu $
13,$
8 # mul_add_c2(a[1],b[4],c3,c1,c2);
1669 multu $
14,$
15 # mul_add_c2(a[2],b[3],c3,c1,c2);
1683 multu $
10,$
12 # mul_add_c2(a[6],b[0],c1,c2,c3);
1701 multu $
9,$
13 # mul_add_c2(a[5],b[1],c1,c2,c3);
1715 multu $
8,$
14 # mul_add_c2(a[4],b[2],c1,c2,c3);
1730 multu $
15,$
15 # mul_add_c(a[3],b[3],c1,c2,c3);
1745 multu $
12,$
11 # mul_add_c2(a[0],b[7],c2,c3,c1);
1756 multu $
13,$
10 # mul_add_c2(a[1],b[6],c2,c3,c1);
1770 multu $
14,$
9 # mul_add_c2(a[2],b[5],c2,c3,c1);
1785 multu $
15,$
8 # mul_add_c2(a[3],b[4],c2,c3,c1);
1800 multu $
11,$
13 # mul_add_c2(a[7],b[1],c3,c1,c2);
1817 multu $
10,$
14 # mul_add_c2(a[6],b[2],c3,c1,c2);
1831 multu $
9,$
15 # mul_add_c2(a[5],b[3],c3,c1,c2);
1846 multu $
8,$
8 # mul_add_c(a[4],b[4],c3,c1,c2);
1861 multu $
14,$
11 # mul_add_c2(a[2],b[7],c1,c2,c3);
1872 multu $
15,$
10 # mul_add_c2(a[3],b[6],c1,c2,c3);
1886 multu $
8,$
9 # mul_add_c2(a[4],b[5],c1,c2,c3);
1901 multu $
11,$
15 # mul_add_c2(a[7],b[3],c2,c3,c1);
1918 multu $
10,$
8 # mul_add_c2(a[6],b[4],c2,c3,c1);
1932 multu $
9,$
9 # mul_add_c(a[5],b[5],c2,c3,c1);
1947 multu $
8,$
11 # mul_add_c2(a[4],b[7],c3,c1,c2);
1958 multu $
9,$
10 # mul_add_c2(a[5],b[6],c3,c1,c2);
1972 multu $
11,$
9 # mul_add_c2(a[7],b[5],c1,c2,c3);
1989 multu $
10,$
10 # mul_add_c(a[6],b[6],c1,c2,c3);
2003 multu $
10,$
11 # mul_add_c2(a[6],b[7],c2,c3,c1);
2014 multu $
11,$
11 # mul_add_c(a[7],b[7],c3,c1,c2);
2041 .globl bn_sqr_comba4
2047 multu $
12,$
12 # mul_add_c(a[0],b[0],c1,c2,c3);
2054 multu $
12,$
13 # mul_add_c2(a[0],b[1],c2,c3,c1);
2059 multu $
14,$
12 # mul_add_c2(a[2],b[0],c3,c1,c2);
2072 multu $
13,$
13 # mul_add_c(a[1],b[1],c3,c1,c2);
2086 multu $
12,$
15 # mul_add_c2(a[0],b[3],c1,c2,c3);
2097 multu $
13,$
14 # mul_add_c2(a[1],b[2],c1,c2,c3);
2111 multu $
15,$
13 # mul_add_c2(a[3],b[1],c2,c3,c1);
2128 multu $
14,$
14 # mul_add_c(a[2],b[2],c2,c3,c1);
2142 multu $
14,$
15 # mul_add_c2(a[2],b[3],c3,c1,c2);
2153 multu $
15,$
15 # mul_add_c(a[3],b[3],c1,c2,c3);