1 .ident "sparcv8.s, Version 1.4"
2 .ident "SPARC v8 ISA artwork by Andy Polyakov <appro@fy.chalmers.se>"
5 * ====================================================================
6 * Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
9 * Rights for redistribution and usage in source and binary forms are
10 * granted according to the OpenSSL license. Warranty of any kind is
12 * ====================================================================
16 * This is my modest contribution to the OpenSSL project (see
17 * http://www.openssl.org/ for more information about it) and is
18 * a drop-in SuperSPARC ISA replacement for crypto/bn/bn_asm.c
19 * module. For updates see http://fy.chalmers.se/~appro/hpe/.
21 * See bn_asm.sparc.v8plus.S for more details.
27 * 1.1 - new loop unrolling model(*);
28 * 1.2 - made gas friendly;
29 * 1.3 - fixed problem with /usr/ccs/lib/cpp;
32 * (*) see bn_asm.sparc.v8plus.S for details
35 .section ".text",#alloc,#execinstr
36 .file "bn_asm.sparc.v8.S"
40 .global bn_mul_add_words
42 * BN_ULONG bn_mul_add_words(rp,ap,num,w)
! Control-flow skeleton of bn_mul_add_words.
! NOTE(review): this listing omits the entry label, the arithmetic and all
! delay-slot instructions, so the notes below cover branches and labels only.
!
! bg,a = branch if greater with the annul bit set: the delay-slot
! instruction is executed only when the branch is taken.
49 bg,a .L_bn_mul_add_words_proceed
54 .L_bn_mul_add_words_proceed:
! Zero result from the preceding flag-setting instruction (not shown):
! bypass the main loop and go straight to the tail code.
56 bz .L_bn_mul_add_words_tail
! Top of the main loop.
59 .L_bn_mul_add_words_loop:
! Bottom of the main loop: iterate again while the flags say non-zero.
102 bnz,a .L_bn_mul_add_words_loop
! Loop finished: non-zero sends control to the tail code (presumably the
! leftover words, confirm against the full file); otherwise execution
! falls through to the return label just below.
106 bnz,a .L_bn_mul_add_words_tail
108 .L_bn_mul_add_words_return:
113 .L_bn_mul_add_words_tail:
! Two exits from the tail code back to the return label, each taken on zero.
122 bz .L_bn_mul_add_words_return
134 bz .L_bn_mul_add_words_return
! Symbol bookkeeping: mark the symbol as a function and record its size as
! the distance from the entry label to this point.
148 .type bn_mul_add_words,#function
149 .size bn_mul_add_words,(.-bn_mul_add_words)
155 * BN_ULONG bn_mul_words(rp,ap,num,w)
! Control-flow skeleton of bn_mul_words.
! NOTE(review): this listing omits the entry label, the arithmetic and all
! delay-slot instructions, so the notes below cover branches and labels only.
!
! The label below is spelled with three e characters; the branch and the
! label definition agree, so the reference resolves.
! bg,a = branch if greater; the delay slot runs only when the branch is taken.
162 bg,a .L_bn_mul_words_proceeed
167 .L_bn_mul_words_proceeed:
! Zero result from the preceding flag-setting instruction (not shown):
! bypass the main loop and go straight to the tail code.
169 bz .L_bn_mul_words_tail
! Top of the main loop.
172 .L_bn_mul_words_loop:
! Bottom of the main loop: iterate again while the flags say non-zero.
204 bnz,a .L_bn_mul_words_loop
! Loop finished: non-zero sends control to the tail code; otherwise
! execution falls through to the return label just below.
208 bnz,a .L_bn_mul_words_tail
210 .L_bn_mul_words_return:
215 .L_bn_mul_words_tail:
! Two exits from the tail code back to the return label, each taken on zero.
221 bz .L_bn_mul_words_return
231 bz .L_bn_mul_words_return
! Symbol bookkeeping: function type and size (distance from the entry label).
242 .type bn_mul_words,#function
243 .size bn_mul_words,(.-bn_mul_words)
248 * void bn_sqr_words(r,a,n)
! Control-flow skeleton of bn_sqr_words.
! NOTE(review): this listing omits the entry label, the arithmetic and all
! delay-slot instructions, so the notes below cover branches and labels only.
!
! The label below is spelled with three e characters; the branch and the
! label definition agree, so the reference resolves.
! bg,a = branch if greater; the delay slot runs only when the branch is taken.
254 bg,a .L_bn_sqr_words_proceeed
259 .L_bn_sqr_words_proceeed:
! Zero result from the preceding flag-setting instruction (not shown):
! bypass the main loop and go straight to the tail code.
261 bz .L_bn_sqr_words_tail
! Top of the main loop.
264 .L_bn_sqr_words_loop:
! Bottom of the main loop: iterate again while the flags say non-zero.
292 bnz,a .L_bn_sqr_words_loop
! Loop finished: non-zero sends control to the tail code; otherwise
! execution falls through to the return label just below.
297 bnz,a .L_bn_sqr_words_tail
299 .L_bn_sqr_words_return:
303 .L_bn_sqr_words_tail:
! Two exits from the tail code back to the return label, each taken on zero.
308 bz .L_bn_sqr_words_return
317 bz .L_bn_sqr_words_return
! Symbol bookkeeping: function type and size (distance from the entry label).
328 .type bn_sqr_words,#function
329 .size bn_sqr_words,(.-bn_sqr_words)
335 * BN_ULONG bn_div_words(h,l,d)
! NOTE(review): the instructions of bn_div_words are not visible in this
! listing; only the symbol bookkeeping below is. The two directives mark
! the symbol as a function and record its size as the distance from the
! entry label to this point.
344 .type bn_div_words,#function
345 .size bn_div_words,(.-bn_div_words)
351 * BN_ULONG bn_add_words(rp,ap,bp,n)
352 * BN_ULONG *rp,*ap,*bp;
! Control-flow skeleton of bn_add_words.
! NOTE(review): this listing omits the entry label, the add-with-carry
! arithmetic and most delay-slot instructions.
!
! bg,a = branch if greater; the delay slot runs only when the branch is taken.
357 bg,a .L_bn_add_words_proceed
362 .L_bn_add_words_proceed:
! Zero result from the preceding flag-setting instruction (not shown):
! bypass the main loop and go straight to the tail code.
364 bz .L_bn_add_words_tail
! First entry into the loop: jump past the loop-top label to the second
! label, skipping whatever sits between the two labels. The addcc in the
! delay slot adds zero to %g0 and discards the result, so its only effect
! is to set the condition codes with the carry bit clear.
! The target is spelled warn_loop here (bn_sub_words uses warm_loop); the
! branch and the label definition agree.
366 ba .L_bn_add_words_warn_loop
367 addcc %g0,0,%g0 ! clear carry flag
! Loop-top label used by the back edge at the bottom of the loop.
369 .L_bn_add_words_loop:
! Entry point used only by the initial ba above.
371 .L_bn_add_words_warn_loop:
! Bottom of the main loop: iterate again while the flags say non-zero.
396 bnz,a .L_bn_add_words_loop
! Loop finished: non-zero sends control to the tail code; otherwise
! execution falls through to the return label just below.
400 bnz,a .L_bn_add_words_tail
402 .L_bn_add_words_return:
406 .L_bn_add_words_tail:
! Two exits from the tail code back to the return label, each taken on zero.
412 bz .L_bn_add_words_return
421 bz .L_bn_add_words_return
! Symbol bookkeeping: function type and size (distance from the entry label).
432 .type bn_add_words,#function
433 .size bn_add_words,(.-bn_add_words)
439 * BN_ULONG bn_sub_words(rp,ap,bp,n)
440 * BN_ULONG *rp,*ap,*bp;
! Control-flow skeleton of bn_sub_words.
! NOTE(review): this listing omits the entry label, the subtract-with-borrow
! arithmetic and most delay-slot instructions.
!
! bg,a = branch if greater; the delay slot runs only when the branch is taken.
445 bg,a .L_bn_sub_words_proceed
450 .L_bn_sub_words_proceed:
! Zero result from the preceding flag-setting instruction (not shown):
! bypass the main loop and go straight to the tail code.
452 bz .L_bn_sub_words_tail
! First entry into the loop: jump past the loop-top label to the warm_loop
! label, skipping whatever sits between the two labels. The addcc in the
! delay slot adds zero to %g0 and discards the result, so its only effect
! is to set the condition codes with the carry bit clear.
454 ba .L_bn_sub_words_warm_loop
455 addcc %g0,0,%g0 ! clear carry flag
! Loop-top label used by the back edge at the bottom of the loop.
457 .L_bn_sub_words_loop:
! Entry point used only by the initial ba above.
459 .L_bn_sub_words_warm_loop:
! Bottom of the main loop: iterate again while the flags say non-zero.
484 bnz,a .L_bn_sub_words_loop
! Loop finished: non-zero sends control to the tail code; otherwise
! execution falls through to the return label just below.
489 bnz,a .L_bn_sub_words_tail
491 .L_bn_sub_words_return:
495 .L_bn_sub_words_tail:
! Two exits from the tail code back to the return label, each taken on zero.
501 bz .L_bn_sub_words_return
511 bz .L_bn_sub_words_return
! Symbol bookkeeping: function type and size (distance from the entry label).
522 .type bn_sub_words,#function
523 .size bn_sub_words,(.-bn_sub_words)
! Stack adjustment used by the save instruction at the top of every comba
! routine below. It is negative because save adds it to %sp.
525 #define FRAME_SIZE -96
528 * Here is register usage map for *all* routines below.
! Word accessors: each of the three macros below forms the memory operand
! for the 4-byte word number I, relative to %i1, %i2 and %i0 respectively.
536 #define ap(I) [%i1+4*I]
537 #define bp(I) [%i2+4*I]
538 #define rp(I) [%i0+4*I]
559 .global bn_mul_comba8
561 * void bn_mul_comba8(r,a,b)
! bn_mul_comba8 produces the sixteen result words r[0]..r[15] one column
! at a time. Every umul carries an annotation naming the mul_add_c step it
! implements, and every st writes one finished result word. The three
! accumulators c_1, c_2 and c_3 swap roles from column to column, as the
! annotations show.
! NOTE(review): this listing omits many instructions (operand loads, most
! carry propagation, the return sequence). t_2 is added with carry into the
! next accumulator; presumably it holds the high product word read from %y
! by an instruction not shown here - confirm against the full file.
!
! Prologue: open a new register window and move %sp by FRAME_SIZE.
565 save %sp,FRAME_SIZE,%sp
! column 0: a[0]*b[0]; the low product word lands directly in c_1
568 umul a_0,b_0,c_1 !=!mul_add_c(a[0],b[0],c1,c2,c3);
571 st c_1,rp(0) !r[0]=c1;
! column 1: a[i]*b[j] with i+j = 1
573 umul a_0,b_1,t_1 !=!mul_add_c(a[0],b[1],c2,c3,c1);
577 addxcc %g0,t_2,c_3 !=
580 umul a_1,b_0,t_1 !mul_add_c(a[1],b[0],c2,c3,c1);
584 st c_2,rp(1) !r[1]=c2;
! column 2: a[i]*b[j] with i+j = 2
587 umul a_2,b_0,t_1 !mul_add_c(a[2],b[0],c3,c1,c2);
590 addxcc c_1,t_2,c_1 !=
593 umul a_1,b_1,t_1 !mul_add_c(a[1],b[1],c3,c1,c2);
599 umul a_0,b_2,t_1 !mul_add_c(a[0],b[2],c3,c1,c2);
602 addxcc c_1,t_2,c_1 !=
604 st c_3,rp(2) !r[2]=c3;
! column 3: a[i]*b[j] with i+j = 3
606 umul a_0,b_3,t_1 !mul_add_c(a[0],b[3],c1,c2,c3);
611 umul a_1,b_2,t_1 !=!mul_add_c(a[1],b[2],c1,c2,c3);
617 umul a_2,b_1,t_1 !mul_add_c(a[2],b[1],c1,c2,c3);
623 umul a_3,b_0,t_1 !mul_add_c(a[3],b[0],c1,c2,c3);!=
628 st c_1,rp(3) !r[3]=c1;
! column 4: a[i]*b[j] with i+j = 4
630 umul a_4,b_0,t_1 !mul_add_c(a[4],b[0],c2,c3,c1);
635 umul a_3,b_1,t_1 !mul_add_c(a[3],b[1],c2,c3,c1);
640 umul a_2,b_2,t_1 !=!mul_add_c(a[2],b[2],c2,c3,c1);
646 umul a_1,b_3,t_1 !mul_add_c(a[1],b[3],c2,c3,c1);
652 umul a_0,b_4,t_1 !=!mul_add_c(a[0],b[4],c2,c3,c1);
657 st c_2,rp(4) !r[4]=c2;
! column 5: a[i]*b[j] with i+j = 5
659 umul a_0,b_5,t_1 !mul_add_c(a[0],b[5],c3,c1,c2);
664 umul a_1,b_4,t_1 !mul_add_c(a[1],b[4],c3,c1,c2);
669 umul a_2,b_3,t_1 !=!mul_add_c(a[2],b[3],c3,c1,c2);
674 umul a_3,b_2,t_1 !mul_add_c(a[3],b[2],c3,c1,c2);
677 addxcc c_1,t_2,c_1 !=
680 umul a_4,b_1,t_1 !mul_add_c(a[4],b[1],c3,c1,c2);
686 umul a_5,b_0,t_1 !mul_add_c(a[5],b[0],c3,c1,c2);
689 addxcc c_1,t_2,c_1 !=
691 st c_3,rp(5) !r[5]=c3;
! column 6: a[i]*b[j] with i+j = 6
693 umul a_6,b_0,t_1 !mul_add_c(a[6],b[0],c1,c2,c3);
698 umul a_5,b_1,t_1 !=!mul_add_c(a[5],b[1],c1,c2,c3);
703 umul a_4,b_2,t_1 !mul_add_c(a[4],b[2],c1,c2,c3);
706 addxcc c_2,t_2,c_2 !=
708 umul a_3,b_3,t_1 !mul_add_c(a[3],b[3],c1,c2,c3);
713 umul a_2,b_4,t_1 !mul_add_c(a[2],b[4],c1,c2,c3);
719 umul a_1,b_5,t_1 !mul_add_c(a[1],b[5],c1,c2,c3);
722 addxcc c_2,t_2,c_2 !=
725 umul a_0,b_6,t_1 !mul_add_c(a[0],b[6],c1,c2,c3);
729 st c_1,rp(6) !r[6]=c1;
! column 7: a[i]*b[j] with i+j = 7 (the longest column, eight products)
732 umul a_0,b_7,t_1 !mul_add_c(a[0],b[7],c2,c3,c1);
735 addxcc c_3,t_2,c_3 !=
737 umul a_1,b_6,t_1 !mul_add_c(a[1],b[6],c2,c3,c1);
742 umul a_2,b_5,t_1 !mul_add_c(a[2],b[5],c2,c3,c1);
747 umul a_3,b_4,t_1 !=!mul_add_c(a[3],b[4],c2,c3,c1);
752 umul a_4,b_3,t_1 !mul_add_c(a[4],b[3],c2,c3,c1);
755 addxcc c_3,t_2,c_3 !=
757 umul a_5,b_2,t_1 !mul_add_c(a[5],b[2],c2,c3,c1);
763 umul a_6,b_1,t_1 !=!mul_add_c(a[6],b[1],c2,c3,c1);
768 umul a_7,b_0,t_1 !mul_add_c(a[7],b[0],c2,c3,c1);
771 addxcc c_3,t_2,c_3 !=
773 st c_2,rp(7) !r[7]=c2;
! column 8: a[i]*b[j] with i+j = 8
775 umul a_7,b_1,t_1 !mul_add_c(a[7],b[1],c3,c1,c2);
780 umul a_6,b_2,t_1 !=!mul_add_c(a[6],b[2],c3,c1,c2);
785 umul a_5,b_3,t_1 !mul_add_c(a[5],b[3],c3,c1,c2);
788 addxcc c_1,t_2,c_1 !=
790 umul a_4,b_4,t_1 !mul_add_c(a[4],b[4],c3,c1,c2);
795 umul a_3,b_5,t_1 !mul_add_c(a[3],b[5],c3,c1,c2);
800 umul a_2,b_6,t_1 !=!mul_add_c(a[2],b[6],c3,c1,c2);
805 umul a_1,b_7,t_1 !mul_add_c(a[1],b[7],c3,c1,c2);
810 st c_3,rp(8) !r[8]=c3;
! column 9: a[i]*b[j] with i+j = 9
812 umul a_2,b_7,t_1 !mul_add_c(a[2],b[7],c1,c2,c3);
817 umul a_3,b_6,t_1 !=!mul_add_c(a[3],b[6],c1,c2,c3);
822 umul a_4,b_5,t_1 !mul_add_c(a[4],b[5],c1,c2,c3);
825 addxcc c_2,t_2,c_2 !=
827 umul a_5,b_4,t_1 !mul_add_c(a[5],b[4],c1,c2,c3);
832 umul a_6,b_3,t_1 !mul_add_c(a[6],b[3],c1,c2,c3);
837 umul a_7,b_2,t_1 !=!mul_add_c(a[7],b[2],c1,c2,c3);
842 st c_1,rp(9) !r[9]=c1;
! column 10: a[i]*b[j] with i+j = 10
844 umul a_7,b_3,t_1 !mul_add_c(a[7],b[3],c2,c3,c1);
849 umul a_6,b_4,t_1 !mul_add_c(a[6],b[4],c2,c3,c1);
854 umul a_5,b_5,t_1 !=!mul_add_c(a[5],b[5],c2,c3,c1);
859 umul a_4,b_6,t_1 !mul_add_c(a[4],b[6],c2,c3,c1);
862 addxcc c_3,t_2,c_3 !=
864 umul a_3,b_7,t_1 !mul_add_c(a[3],b[7],c2,c3,c1);
869 st c_2,rp(10) !r[10]=c2;
! column 11: a[i]*b[j] with i+j = 11
871 umul a_4,b_7,t_1 !=!mul_add_c(a[4],b[7],c3,c1,c2);
876 umul a_5,b_6,t_1 !mul_add_c(a[5],b[6],c3,c1,c2);
879 addxcc c_1,t_2,c_1 !=
881 umul a_6,b_5,t_1 !mul_add_c(a[6],b[5],c3,c1,c2);
886 umul a_7,b_4,t_1 !mul_add_c(a[7],b[4],c3,c1,c2);
890 st c_3,rp(11) !r[11]=c3;
! column 12: a[i]*b[j] with i+j = 12
893 umul a_7,b_5,t_1 !mul_add_c(a[7],b[5],c1,c2,c3);
896 addxcc c_2,t_2,c_2 !=
898 umul a_6,b_6,t_1 !mul_add_c(a[6],b[6],c1,c2,c3);
903 umul a_5,b_7,t_1 !mul_add_c(a[5],b[7],c1,c2,c3);
907 st c_1,rp(12) !r[12]=c1;
! column 13: a[i]*b[j] with i+j = 13
910 umul a_6,b_7,t_1 !mul_add_c(a[6],b[7],c2,c3,c1);
913 addxcc c_3,t_2,c_3 !=
915 umul a_7,b_6,t_1 !mul_add_c(a[7],b[6],c2,c3,c1);
920 st c_2,rp(13) !r[13]=c2;
! column 14: a[7]*b[7]; its two words become r[14] and r[15]
922 umul a_7,b_7,t_1 !=!mul_add_c(a[7],b[7],c3,c1,c2);
927 st c_3,rp(14) !r[14]=c3;
928 st c_1,rp(15) !r[15]=c1;
! Symbol bookkeeping: function type and size (distance from the entry label).
933 .type bn_mul_comba8,#function
934 .size bn_mul_comba8,(.-bn_mul_comba8)
938 .global bn_mul_comba4
940 * void bn_mul_comba4(r,a,b)
! bn_mul_comba4 produces the eight result words r[0]..r[7] one column at a
! time. Every umul carries an annotation naming the mul_add_c step it
! implements, and every st writes one finished result word. The three
! accumulators c_1, c_2 and c_3 swap roles from column to column.
! NOTE(review): this listing omits many instructions (operand loads, most
! carry propagation, the return sequence). t_2 is presumably the high
! product word read from %y by an instruction not shown here - confirm.
!
! Prologue: open a new register window and move %sp by FRAME_SIZE.
944 save %sp,FRAME_SIZE,%sp
! column 0: a[0]*b[0]; the low product word lands directly in c_1
947 umul a_0,b_0,c_1 !=!mul_add_c(a[0],b[0],c1,c2,c3);
950 st c_1,rp(0) !r[0]=c1;
! column 1: a[i]*b[j] with i+j = 1
952 umul a_0,b_1,t_1 !=!mul_add_c(a[0],b[1],c2,c3,c1);
959 umul a_1,b_0,t_1 !=!mul_add_c(a[1],b[0],c2,c3,c1);
964 st c_2,rp(1) !r[1]=c2;
! column 2: a[i]*b[j] with i+j = 2
966 umul a_2,b_0,t_1 !mul_add_c(a[2],b[0],c3,c1,c2);
972 umul a_1,b_1,t_1 !=!mul_add_c(a[1],b[1],c3,c1,c2);
978 umul a_0,b_2,t_1 !mul_add_c(a[0],b[2],c3,c1,c2);
983 st c_3,rp(2) !r[2]=c3;
! column 3: a[i]*b[j] with i+j = 3 (the longest column, four products)
985 umul a_0,b_3,t_1 !=!mul_add_c(a[0],b[3],c1,c2,c3);
990 umul a_1,b_2,t_1 !mul_add_c(a[1],b[2],c1,c2,c3);
993 addxcc c_2,t_2,c_2 !=
996 umul a_2,b_1,t_1 !mul_add_c(a[2],b[1],c1,c2,c3);
1001 umul a_3,b_0,t_1 !=!mul_add_c(a[3],b[0],c1,c2,c3);
1006 st c_1,rp(3) !r[3]=c1;
! column 4: a[i]*b[j] with i+j = 4
1008 umul a_3,b_1,t_1 !mul_add_c(a[3],b[1],c2,c3,c1);
1013 umul a_2,b_2,t_1 !mul_add_c(a[2],b[2],c2,c3,c1);
1014 addcc c_2,t_1,c_2 !=
1018 umul a_1,b_3,t_1 !=!mul_add_c(a[1],b[3],c2,c3,c1);
1023 st c_2,rp(4) !r[4]=c2;
! column 5: a[i]*b[j] with i+j = 5
1025 umul a_2,b_3,t_1 !mul_add_c(a[2],b[3],c3,c1,c2);
1030 umul a_3,b_2,t_1 !mul_add_c(a[3],b[2],c3,c1,c2);
1031 addcc c_3,t_1,c_3 !=
1034 st c_3,rp(5) !r[5]=c3;
! column 6: a[3]*b[3]; its two words become r[6] and r[7]
1037 umul a_3,b_3,t_1 !mul_add_c(a[3],b[3],c1,c2,c3);
1040 addxcc c_2,t_2,c_2 !=
1041 st c_1,rp(6) !r[6]=c1;
1042 st c_2,rp(7) !r[7]=c2;
! Symbol bookkeeping: function type and size (distance from the entry label).
1047 .type bn_mul_comba4,#function
1048 .size bn_mul_comba4,(.-bn_mul_comba4)
1052 .global bn_sqr_comba8
! bn_sqr_comba8 produces the sixteen result words r[0]..r[15] one column
! at a time. Every umul carries an annotation: sqr_add_c marks a square
! a[i]*a[i], sqr_add_c2 marks a cross product a[i]*a[j] with i != j. For
! sqr_add_c2 steps the low product word t_1 is accumulated twice; the two
! addcc instructions per step are visible in columns 6, 8 and 13. The
! three accumulators c_1, c_2 and c_3 swap roles from column to column.
! NOTE(review): this listing omits many instructions (operand loads, most
! carry propagation, the return sequence). t_2 is presumably the high
! product word read from %y by an instruction not shown here - confirm.
!
! Prologue: open a new register window and move %sp by FRAME_SIZE.
1054 save %sp,FRAME_SIZE,%sp
! column 0: a[0]*a[0]; the low product word lands directly in c_1
1057 umul a_0,a_0,c_1 !=!sqr_add_c(a,0,c1,c2,c3);
1059 st c_1,rp(0) !r[0]=c1;
! column 1: a[i]*a[j] with i+j = 1
1062 umul a_0,a_1,t_1 !=!sqr_add_c2(a,1,0,c2,c3,c1);
1069 st c_2,rp(1) !r[1]=c2;
! column 2: a[i]*a[j] with i+j = 2
1072 umul a_2,a_0,t_1 !sqr_add_c2(a,2,0,c3,c1,c2);
1075 addxcc c_1,t_2,c_1 !=
1081 umul a_1,a_1,t_1 !sqr_add_c(a,1,c3,c1,c2);
1086 st c_3,rp(2) !r[2]=c3;
! column 3: a[i]*a[j] with i+j = 3
1088 umul a_0,a_3,t_1 !=!sqr_add_c2(a,3,0,c1,c2,c3);
1097 umul a_1,a_2,t_1 !sqr_add_c2(a,2,1,c1,c2,c3);
1100 addxcc c_2,t_2,c_2 !=
1105 st c_1,rp(3) !r[3]=c1;
! column 4: a[i]*a[j] with i+j = 4
1107 umul a_4,a_0,t_1 !sqr_add_c2(a,4,0,c2,c3,c1);
1113 addxcc c_3,t_2,c_3 !=
1115 umul a_3,a_1,t_1 !sqr_add_c2(a,3,1,c2,c3,c1);
1121 addxcc c_3,t_2,c_3 !=
1124 umul a_2,a_2,t_1 !sqr_add_c(a,2,c2,c3,c1);
1125 addcc c_2,t_1,c_2 !=
1128 st c_2,rp(4) !r[4]=c2;
! column 5: a[i]*a[j] with i+j = 5
1131 umul a_0,a_5,t_1 !sqr_add_c2(a,5,0,c3,c1,c2);
1134 addxcc c_1,t_2,c_1 !=
1139 umul a_1,a_4,t_1 !sqr_add_c2(a,4,1,c3,c1,c2);
1142 addxcc c_1,t_2,c_1 !=
1148 umul a_2,a_3,t_1 !sqr_add_c2(a,3,2,c3,c1,c2);
1154 addxcc c_1,t_2,c_1 !=
1156 st c_3,rp(5) !r[5]=c3;
! column 6: a[i]*a[j] with i+j = 6; each cross product is added twice
1158 umul a_6,a_0,t_1 !sqr_add_c2(a,6,0,c1,c2,c3);
1159 addcc c_1,t_1,c_1 !=
1163 addcc c_1,t_1,c_1 !=
1166 umul a_5,a_1,t_1 !sqr_add_c2(a,5,1,c1,c2,c3);
1167 addcc c_1,t_1,c_1 !=
1171 addcc c_1,t_1,c_1 !=
1174 umul a_4,a_2,t_1 !sqr_add_c2(a,4,2,c1,c2,c3);
1175 addcc c_1,t_1,c_1 !=
1179 addcc c_1,t_1,c_1 !=
1183 umul a_3,a_3,t_1 !=!sqr_add_c(a,3,c1,c2,c3);
1188 st c_1,rp(6) !r[6]=c1;
! column 7: a[i]*a[j] with i+j = 7
1190 umul a_0,a_7,t_1 !sqr_add_c2(a,7,0,c2,c3,c1);
1196 addxcc c_3,t_2,c_3 !=
1198 umul a_1,a_6,t_1 !sqr_add_c2(a,6,1,c2,c3,c1);
1204 addxcc c_3,t_2,c_3 !=
1206 umul a_2,a_5,t_1 !sqr_add_c2(a,5,2,c2,c3,c1);
1212 addxcc c_3,t_2,c_3 !=
1214 umul a_3,a_4,t_1 !sqr_add_c2(a,4,3,c2,c3,c1);
1220 addxcc c_3,t_2,c_3 !=
1222 st c_2,rp(7) !r[7]=c2;
! column 8: a[i]*a[j] with i+j = 8; each cross product is added twice
1224 umul a_7,a_1,t_1 !sqr_add_c2(a,7,1,c3,c1,c2);
1225 addcc c_3,t_1,c_3 !=
1229 addcc c_3,t_1,c_3 !=
1232 umul a_6,a_2,t_1 !sqr_add_c2(a,6,2,c3,c1,c2);
1233 addcc c_3,t_1,c_3 !=
1237 addcc c_3,t_1,c_3 !=
1240 umul a_5,a_3,t_1 !sqr_add_c2(a,5,3,c3,c1,c2);
1241 addcc c_3,t_1,c_3 !=
1245 addcc c_3,t_1,c_3 !=
1248 umul a_4,a_4,t_1 !sqr_add_c(a,4,c3,c1,c2);
1249 addcc c_3,t_1,c_3 !=
1252 st c_3,rp(8) !r[8]=c3;
! column 9: a[i]*a[j] with i+j = 9
1255 umul a_2,a_7,t_1 !sqr_add_c2(a,7,2,c1,c2,c3);
1258 addxcc c_2,t_2,c_2 !=
1263 umul a_3,a_6,t_1 !sqr_add_c2(a,6,3,c1,c2,c3);
1266 addxcc c_2,t_2,c_2 !=
1271 umul a_4,a_5,t_1 !sqr_add_c2(a,5,4,c1,c2,c3);
1274 addxcc c_2,t_2,c_2 !=
1279 st c_1,rp(9) !r[9]=c1;
! column 10: a[i]*a[j] with i+j = 10
1281 umul a_7,a_3,t_1 !sqr_add_c2(a,7,3,c2,c3,c1);
1287 addxcc c_3,t_2,c_3 !=
1289 umul a_6,a_4,t_1 !sqr_add_c2(a,6,4,c2,c3,c1);
1295 addxcc c_3,t_2,c_3 !=
1297 umul a_5,a_5,t_1 !sqr_add_c(a,5,c2,c3,c1);
1302 st c_2,rp(10) !r[10]=c2;
! column 11: a[i]*a[j] with i+j = 11
1304 umul a_4,a_7,t_1 !=!sqr_add_c2(a,7,4,c3,c1,c2);
1312 umul a_5,a_6,t_1 !=!sqr_add_c2(a,6,5,c3,c1,c2);
1319 st c_3,rp(11) !r[11]=c3;
! column 12: a[i]*a[j] with i+j = 12
1322 umul a_7,a_5,t_1 !sqr_add_c2(a,7,5,c1,c2,c3);
1325 addxcc c_2,t_2,c_2 !=
1330 umul a_6,a_6,t_1 !sqr_add_c(a,6,c1,c2,c3);
1333 addxcc c_2,t_2,c_2 !=
1335 st c_1,rp(12) !r[12]=c1;
! column 13: a[6]*a[7], added twice
1337 umul a_6,a_7,t_1 !sqr_add_c2(a,7,6,c2,c3,c1);
1338 addcc c_2,t_1,c_2 !=
1342 addcc c_2,t_1,c_2 !=
1344 st c_2,rp(13) !r[13]=c2;
! column 14: a[7]*a[7]; its two words become r[14] and r[15]
1347 umul a_7,a_7,t_1 !sqr_add_c(a,7,c3,c1,c2);
1350 addxcc c_1,t_2,c_1 !=
1351 st c_3,rp(14) !r[14]=c3;
1352 st c_1,rp(15) !r[15]=c1;
! Symbol bookkeeping: function type and size (distance from the entry label).
1357 .type bn_sqr_comba8,#function
1358 .size bn_sqr_comba8,(.-bn_sqr_comba8)
1362 .global bn_sqr_comba4
1364 * void bn_sqr_comba4(r,a)
! bn_sqr_comba4 produces the eight result words r[0]..r[7] one column at a
! time. Every umul carries an annotation: sqr_add_c marks a square
! a[i]*a[i], sqr_add_c2 marks a cross product a[i]*a[j] with i != j. The
! three accumulators c_1, c_2 and c_3 swap roles from column to column.
! NOTE(review): this listing omits many instructions (operand loads, most
! carry propagation, the return sequence). t_2 is presumably the high
! product word read from %y by an instruction not shown here - confirm.
!
! Prologue: open a new register window and move %sp by FRAME_SIZE.
1368 save %sp,FRAME_SIZE,%sp
! column 0: a[0]*a[0]; the low product word lands directly in c_1
1370 umul a_0,a_0,c_1 !sqr_add_c(a,0,c1,c2,c3);
1373 st c_1,rp(0) !r[0]=c1;
! column 1: a[i]*a[j] with i+j = 1
1376 umul a_0,a_1,t_1 !=!sqr_add_c2(a,1,0,c2,c3,c1);
1384 st c_2,rp(1) !r[1]=c2;
! column 2: a[i]*a[j] with i+j = 2
1386 umul a_2,a_0,t_1 !sqr_add_c2(a,2,0,c3,c1,c2);
1392 addxcc c_1,t_2,c_1 !=
1395 umul a_1,a_1,t_1 !sqr_add_c(a,1,c3,c1,c2);
1396 addcc c_3,t_1,c_3 !=
1399 st c_3,rp(2) !r[2]=c3;
! column 3: a[i]*a[j] with i+j = 3
1402 umul a_0,a_3,t_1 !sqr_add_c2(a,3,0,c1,c2,c3);
1405 addxcc c_2,t_2,c_2 !=
1410 umul a_1,a_2,t_1 !sqr_add_c2(a,2,1,c1,c2,c3);
1413 addxcc c_2,t_2,c_2 !=
1418 st c_1,rp(3) !r[3]=c1;
! column 4: a[i]*a[j] with i+j = 4
1420 umul a_3,a_1,t_1 !sqr_add_c2(a,3,1,c2,c3,c1);
1426 addxcc c_3,t_2,c_3 !=
1428 umul a_2,a_2,t_1 !sqr_add_c(a,2,c2,c3,c1);
1433 st c_2,rp(4) !r[4]=c2;
! column 5: a[2]*a[3]
1435 umul a_2,a_3,t_1 !=!sqr_add_c2(a,3,2,c3,c1,c2);
1442 st c_3,rp(5) !r[5]=c3;
! column 6: a[3]*a[3]; its two words become r[6] and r[7]
1445 umul a_3,a_3,t_1 !sqr_add_c(a,3,c1,c2,c3);
1448 addxcc c_2,t_2,c_2 !=
1449 st c_1,rp(6) !r[6]=c1;
1450 st c_2,rp(7) !r[7]=c2;
! Symbol bookkeeping: function type and size (distance from the entry label).
1455 .type bn_sqr_comba4,#function
1456 .size bn_sqr_comba4,(.-bn_sqr_comba4)