#!/usr/bin/env perl
#
# (Origin: OpenSSL 1.0.2g, tomato.git
#  release/src/router/openssl/crypto/bn/asm/mips.pl,
#  blob acafde5e5685ccb7e2d59b218d45d1bda0572fbd.)
#
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project.
#
# Rights for redistribution and usage in source and binary forms are
# granted according to the OpenSSL license. Warranty of any kind is
# disclaimed.
# ====================================================================
#
# July 1999
#
# This is drop-in MIPS III/IV ISA replacement for crypto/bn/bn_asm.c.
#
# The module is designed to work with either of the "new" MIPS ABI(5),
# namely N32 or N64, offered by IRIX 6.x. It's not meant to work under
# IRIX 5.x not only because it doesn't support new ABIs but also
# because 5.x kernels put R4x00 CPU into 32-bit mode and all those
# 64-bit instructions (daddu, dmultu, etc.) found below gonna only
# cause illegal instruction exception:-(
#
# In addition the code depends on preprocessor flags set up by MIPSpro
# compiler driver (either as or cc) and therefore (probably?) can't be
# compiled by the GNU assembler. GNU C driver manages fine though...
# I mean as long as -mmips-as is specified or is the default option,
# because then it simply invokes /usr/bin/as which in turn takes
# perfect care of the preprocessor definitions. Another neat feature
# offered by the MIPSpro assembler is an optimization pass. This gave
# me the opportunity to have the code looking more regular as all those
# architecture dependent instruction rescheduling details were left to
# the assembler. Cool, huh?
#
# Performance improvement is astonishing! 'apps/openssl speed rsa dsa'
# goes way over 3 times faster!
#
# <appro@fy.chalmers.se>

# October 2010
#
# Adapt the module even for 32-bit ABIs and other OSes. The former was
# achieved by mechanical replacement of 64-bit arithmetic instructions
# such as dmultu, daddu, etc. with their 32-bit counterparts and
# adjusting offsets denoting multiples of BN_ULONG. Above mentioned
# >3x performance improvement naturally does not apply to 32-bit code
# [because there is no instruction 32-bit compiler can't use], one
# has to content with 40-85% improvement depending on benchmark and
# key length, more for longer keys.
# Command line: [flavour] [...] output-file.
# $flavour selects the target ABI: o32 (default), n32, 64, nubi32, nubi64.
$flavour = shift || "o32";
# Scan remaining arguments for something that looks like a file name.
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
# Three-argument open cannot misparse a hostile $output as a pipe or
# mode (2-arg ">$output" is subject to mode injection), and a failed
# open must be fatal instead of silently leaving STDOUT closed.  When
# no file name was supplied, keep writing to the inherited STDOUT.
if ($output) {
	open STDOUT,">",$output or die "can't open $output: $!";
}
if ($flavour =~ /64|n32/i) {
	# 64-bit ABIs (N32/N64): doubleword loads/stores and arithmetic;
	# BN_ULONG and a saved register are both 8 bytes wide.
	$LD="ld";
	$ST="sd";
	$MULTU="dmultu";
	$DIVU="ddivu";
	$ADDU="daddu";
	$SUBU="dsubu";
	$SRL="dsrl";
	$SLL="dsll";
	$BNSZ=8;	# sizeof(BN_ULONG) in bytes
	$PTR_ADD="daddu";
	$PTR_SUB="dsubu";
	$SZREG=8;	# width of a register slot on the stack
	$REG_S="sd";
	$REG_L="ld";
} else {
	# 32-bit ABIs (o32 and nubi32): word-sized counterparts.
	$LD="lw";
	$ST="sw";
	$MULTU="multu";
	$DIVU="divu";
	$ADDU="addu";
	$SUBU="subu";
	$SRL="srl";
	$SLL="sll";
	$BNSZ=4;
	$PTR_ADD="addu";
	$PTR_SUB="subu";
	$SZREG=4;
	$REG_S="sw";
	$REG_L="lw";
	$code=".set mips2\n";	# request MIPS II ISA from the assembler
}
# N32/N64 register layout as used by the original module.  Each Perl
# variable holds the literal assembler register name, e.g. $v0 eq '$2';
# qw() does not interpolate, so the '$' survives into the output.
($zero,$at,$v0,$v1)               = qw($0 $1 $2 $3);
($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7) = qw($4 $5 $6 $7 $8 $9 $10 $11);
($t0,$t1,$t2,$t3)                 = qw($12 $13 $14 $15);
($t8,$t9)                         = qw($24 $25);
($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7) = qw($16 $17 $18 $19 $20 $21 $22 $23);
($gp,$sp,$fp,$ra)                 = qw($28 $29 $30 $31);
($ta0,$ta1,$ta2,$ta3) = ($a4,$a5,$a6,$a7);

# No special adaptation is required for O32. NUBI on the other hand
# is treated by saving/restoring ($v1,$t0..$t3).
$gp = $v1 if ($flavour =~ /nubi/i);

$minus4 = $v1;		# register that will hold the constant -4
# Ident strings, text section, and the public bn_mul_add_words entry.
# The wrapper jumps to the internal routine only when the word count in
# $a2 is positive; otherwise it returns 0.  Note MIPS branch-delay
# slots under ".set noreorder": the instruction after each branch/jump
# executes unconditionally.
$code.=<<___;
.rdata
.asciiz	"mips3.s, Version 1.2"
.asciiz	"MIPS II/III/IV ISA artwork by Andy Polyakov <appro\@fy.chalmers.se>"

.text
.set	noat

.align	5
.globl	bn_mul_add_words
.ent	bn_mul_add_words
bn_mul_add_words:
	.set	noreorder
	bgtz	$a2,bn_mul_add_words_internal
	move	$v0,$zero
	jr	$ra
	move	$a0,$v0
.end	bn_mul_add_words

.align	5
.ent	bn_mul_add_words_internal
bn_mul_add_words_internal:
___
# NUBI flavours treat $t0..$t3 and $gp (aliased to $v1 above) as
# callee-saved, hence this extra stack frame (mirrored in the epilogue).
$code.=<<___ if ($flavour =~ /nubi/i);
	.frame	$sp,6*$SZREG,$ra
	.mask	0x8000f008,-$SZREG
	.set	noreorder
	$PTR_SUB $sp,6*$SZREG
	$REG_S	$ra,5*$SZREG($sp)
	$REG_S	$t3,4*$SZREG($sp)
	$REG_S	$t2,3*$SZREG($sp)
	$REG_S	$t1,2*$SZREG($sp)
	$REG_S	$t0,1*$SZREG($sp)
	$REG_S	$gp,0*$SZREG($sp)
___
# bn_mul_add_words_internal: r[i] += a[i]*w with carry propagation,
# returning the final carry in $v0.  Args: $a0=r, $a1=a, $a2=num (>0),
# $a3=w.  The main loop handles 4 words per iteration; the tail
# handles the remaining num%4 words one at a time.
$code.=<<___;
	.set	reorder
	li	$minus4,-4
	and	$ta0,$a2,$minus4
	beqz	$ta0,.L_bn_mul_add_words_tail

.L_bn_mul_add_words_loop:
	$LD	$t0,0($a1)
	$MULTU	$t0,$a3
	$LD	$t1,0($a0)
	$LD	$t2,$BNSZ($a1)
	$LD	$t3,$BNSZ($a0)
	$LD	$ta0,2*$BNSZ($a1)
	$LD	$ta1,2*$BNSZ($a0)
	$ADDU	$t1,$v0
	sltu	$v0,$t1,$v0	# All manuals say it "compares 32-bit
				# values", but it seems to work fine
				# even on 64-bit registers.
	mflo	$at
	mfhi	$t0
	$ADDU	$t1,$at
	$ADDU	$v0,$t0
	$MULTU	$t2,$a3
	sltu	$at,$t1,$at
	$ST	$t1,0($a0)
	$ADDU	$v0,$at

	$LD	$ta2,3*$BNSZ($a1)
	$LD	$ta3,3*$BNSZ($a0)
	$ADDU	$t3,$v0
	sltu	$v0,$t3,$v0
	mflo	$at
	mfhi	$t2
	$ADDU	$t3,$at
	$ADDU	$v0,$t2
	$MULTU	$ta0,$a3
	sltu	$at,$t3,$at
	$ST	$t3,$BNSZ($a0)
	$ADDU	$v0,$at

	subu	$a2,4
	$PTR_ADD $a0,4*$BNSZ
	$PTR_ADD $a1,4*$BNSZ
	$ADDU	$ta1,$v0
	sltu	$v0,$ta1,$v0
	mflo	$at
	mfhi	$ta0
	$ADDU	$ta1,$at
	$ADDU	$v0,$ta0
	$MULTU	$ta2,$a3
	sltu	$at,$ta1,$at
	$ST	$ta1,-2*$BNSZ($a0)
	$ADDU	$v0,$at

	and	$ta0,$a2,$minus4
	$ADDU	$ta3,$v0
	sltu	$v0,$ta3,$v0
	mflo	$at
	mfhi	$ta2
	$ADDU	$ta3,$at
	$ADDU	$v0,$ta2
	sltu	$at,$ta3,$at
	$ST	$ta3,-$BNSZ($a0)
	.set	noreorder
	bgtz	$ta0,.L_bn_mul_add_words_loop
	$ADDU	$v0,$at

	beqz	$a2,.L_bn_mul_add_words_return
	nop

.L_bn_mul_add_words_tail:
	.set	reorder
	$LD	$t0,0($a1)
	$MULTU	$t0,$a3
	$LD	$t1,0($a0)
	subu	$a2,1
	$ADDU	$t1,$v0
	sltu	$v0,$t1,$v0
	mflo	$at
	mfhi	$t0
	$ADDU	$t1,$at
	$ADDU	$v0,$t0
	sltu	$at,$t1,$at
	$ST	$t1,0($a0)
	$ADDU	$v0,$at
	beqz	$a2,.L_bn_mul_add_words_return

	$LD	$t0,$BNSZ($a1)
	$MULTU	$t0,$a3
	$LD	$t1,$BNSZ($a0)
	subu	$a2,1
	$ADDU	$t1,$v0
	sltu	$v0,$t1,$v0
	mflo	$at
	mfhi	$t0
	$ADDU	$t1,$at
	$ADDU	$v0,$t0
	sltu	$at,$t1,$at
	$ST	$t1,$BNSZ($a0)
	$ADDU	$v0,$at
	beqz	$a2,.L_bn_mul_add_words_return

	$LD	$t0,2*$BNSZ($a1)
	$MULTU	$t0,$a3
	$LD	$t1,2*$BNSZ($a0)
	$ADDU	$t1,$v0
	sltu	$v0,$t1,$v0
	mflo	$at
	mfhi	$t0
	$ADDU	$t1,$at
	$ADDU	$v0,$t0
	sltu	$at,$t1,$at
	$ST	$t1,2*$BNSZ($a0)
	$ADDU	$v0,$at

.L_bn_mul_add_words_return:
	.set	noreorder
___
# Restore NUBI callee-saved registers before returning.
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$t3,4*$SZREG($sp)
	$REG_L	$t2,3*$SZREG($sp)
	$REG_L	$t1,2*$SZREG($sp)
	$REG_L	$t0,1*$SZREG($sp)
	$REG_L	$gp,0*$SZREG($sp)
	$PTR_ADD $sp,6*$SZREG
___
# Return carry in $v0 (copied to $a0 in the delay slot), then the
# guarded public entry for bn_mul_words follows.
$code.=<<___;
	jr	$ra
	move	$a0,$v0
.end	bn_mul_add_words_internal

.align	5
.globl	bn_mul_words
.ent	bn_mul_words
bn_mul_words:
	.set	noreorder
	bgtz	$a2,bn_mul_words_internal
	move	$v0,$zero
	jr	$ra
	move	$a0,$v0
.end	bn_mul_words

.align	5
.ent	bn_mul_words_internal
bn_mul_words_internal:
___
$code.=<<___ if ($flavour =~ /nubi/i);
	.frame	$sp,6*$SZREG,$ra
	.mask	0x8000f008,-$SZREG
	.set	noreorder
	$PTR_SUB $sp,6*$SZREG
	$REG_S	$ra,5*$SZREG($sp)
	$REG_S	$t3,4*$SZREG($sp)
	$REG_S	$t2,3*$SZREG($sp)
	$REG_S	$t1,2*$SZREG($sp)
	$REG_S	$t0,1*$SZREG($sp)
	$REG_S	$gp,0*$SZREG($sp)
___
# bn_mul_words_internal: r[i] = a[i]*w + carry, returning the final
# carry in $v0.  Args: $a0=r, $a1=a, $a2=num (>0), $a3=w.  4-way
# unrolled main loop plus a num%4 tail, same structure as
# bn_mul_add_words but without reading r[].
$code.=<<___;
	.set	reorder
	li	$minus4,-4
	and	$ta0,$a2,$minus4
	beqz	$ta0,.L_bn_mul_words_tail

.L_bn_mul_words_loop:
	$LD	$t0,0($a1)
	$MULTU	$t0,$a3
	$LD	$t2,$BNSZ($a1)
	$LD	$ta0,2*$BNSZ($a1)
	$LD	$ta2,3*$BNSZ($a1)
	mflo	$at
	mfhi	$t0
	$ADDU	$v0,$at
	sltu	$t1,$v0,$at
	$MULTU	$t2,$a3
	$ST	$v0,0($a0)
	$ADDU	$v0,$t1,$t0

	subu	$a2,4
	$PTR_ADD $a0,4*$BNSZ
	$PTR_ADD $a1,4*$BNSZ
	mflo	$at
	mfhi	$t2
	$ADDU	$v0,$at
	sltu	$t3,$v0,$at
	$MULTU	$ta0,$a3
	$ST	$v0,-3*$BNSZ($a0)
	$ADDU	$v0,$t3,$t2

	mflo	$at
	mfhi	$ta0
	$ADDU	$v0,$at
	sltu	$ta1,$v0,$at
	$MULTU	$ta2,$a3
	$ST	$v0,-2*$BNSZ($a0)
	$ADDU	$v0,$ta1,$ta0

	and	$ta0,$a2,$minus4
	mflo	$at
	mfhi	$ta2
	$ADDU	$v0,$at
	sltu	$ta3,$v0,$at
	$ST	$v0,-$BNSZ($a0)
	.set	noreorder
	bgtz	$ta0,.L_bn_mul_words_loop
	$ADDU	$v0,$ta3,$ta2

	beqz	$a2,.L_bn_mul_words_return
	nop

.L_bn_mul_words_tail:
	.set	reorder
	$LD	$t0,0($a1)
	$MULTU	$t0,$a3
	subu	$a2,1
	mflo	$at
	mfhi	$t0
	$ADDU	$v0,$at
	sltu	$t1,$v0,$at
	$ST	$v0,0($a0)
	$ADDU	$v0,$t1,$t0
	beqz	$a2,.L_bn_mul_words_return

	$LD	$t0,$BNSZ($a1)
	$MULTU	$t0,$a3
	subu	$a2,1
	mflo	$at
	mfhi	$t0
	$ADDU	$v0,$at
	sltu	$t1,$v0,$at
	$ST	$v0,$BNSZ($a0)
	$ADDU	$v0,$t1,$t0
	beqz	$a2,.L_bn_mul_words_return

	$LD	$t0,2*$BNSZ($a1)
	$MULTU	$t0,$a3
	mflo	$at
	mfhi	$t0
	$ADDU	$v0,$at
	sltu	$t1,$v0,$at
	$ST	$v0,2*$BNSZ($a0)
	$ADDU	$v0,$t1,$t0

.L_bn_mul_words_return:
	.set	noreorder
___
# NUBI epilogue (restore callee-saved temporaries).
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$t3,4*$SZREG($sp)
	$REG_L	$t2,3*$SZREG($sp)
	$REG_L	$t1,2*$SZREG($sp)
	$REG_L	$t0,1*$SZREG($sp)
	$REG_L	$gp,0*$SZREG($sp)
	$PTR_ADD $sp,6*$SZREG
___
# Return, then the guarded public entry for bn_sqr_words.
$code.=<<___;
	jr	$ra
	move	$a0,$v0
.end	bn_mul_words_internal

.align	5
.globl	bn_sqr_words
.ent	bn_sqr_words
bn_sqr_words:
	.set	noreorder
	bgtz	$a2,bn_sqr_words_internal
	move	$v0,$zero
	jr	$ra
	move	$a0,$v0
.end	bn_sqr_words

.align	5
.ent	bn_sqr_words_internal
bn_sqr_words_internal:
___
$code.=<<___ if ($flavour =~ /nubi/i);
	.frame	$sp,6*$SZREG,$ra
	.mask	0x8000f008,-$SZREG
	.set	noreorder
	$PTR_SUB $sp,6*$SZREG
	$REG_S	$ra,5*$SZREG($sp)
	$REG_S	$t3,4*$SZREG($sp)
	$REG_S	$t2,3*$SZREG($sp)
	$REG_S	$t1,2*$SZREG($sp)
	$REG_S	$t0,1*$SZREG($sp)
	$REG_S	$gp,0*$SZREG($sp)
___
# bn_sqr_words_internal: (r[2i+1],r[2i]) = a[i]^2 for each input word;
# note there is no carry between words, so the output pointer simply
# advances twice as fast as the input.  Args: $a0=r (2*num words),
# $a1=a, $a2=num (>0).
$code.=<<___;
	.set	reorder
	li	$minus4,-4
	and	$ta0,$a2,$minus4
	beqz	$ta0,.L_bn_sqr_words_tail

.L_bn_sqr_words_loop:
	$LD	$t0,0($a1)
	$MULTU	$t0,$t0
	$LD	$t2,$BNSZ($a1)
	$LD	$ta0,2*$BNSZ($a1)
	$LD	$ta2,3*$BNSZ($a1)
	mflo	$t1
	mfhi	$t0
	$ST	$t1,0($a0)
	$ST	$t0,$BNSZ($a0)

	$MULTU	$t2,$t2
	subu	$a2,4
	$PTR_ADD $a0,8*$BNSZ
	$PTR_ADD $a1,4*$BNSZ
	mflo	$t3
	mfhi	$t2
	$ST	$t3,-6*$BNSZ($a0)
	$ST	$t2,-5*$BNSZ($a0)

	$MULTU	$ta0,$ta0
	mflo	$ta1
	mfhi	$ta0
	$ST	$ta1,-4*$BNSZ($a0)
	$ST	$ta0,-3*$BNSZ($a0)

	$MULTU	$ta2,$ta2
	and	$ta0,$a2,$minus4
	mflo	$ta3
	mfhi	$ta2
	$ST	$ta3,-2*$BNSZ($a0)

	.set	noreorder
	bgtz	$ta0,.L_bn_sqr_words_loop
	$ST	$ta2,-$BNSZ($a0)

	beqz	$a2,.L_bn_sqr_words_return
	nop

.L_bn_sqr_words_tail:
	.set	reorder
	$LD	$t0,0($a1)
	$MULTU	$t0,$t0
	subu	$a2,1
	mflo	$t1
	mfhi	$t0
	$ST	$t1,0($a0)
	$ST	$t0,$BNSZ($a0)
	beqz	$a2,.L_bn_sqr_words_return

	$LD	$t0,$BNSZ($a1)
	$MULTU	$t0,$t0
	subu	$a2,1
	mflo	$t1
	mfhi	$t0
	$ST	$t1,2*$BNSZ($a0)
	$ST	$t0,3*$BNSZ($a0)
	beqz	$a2,.L_bn_sqr_words_return

	$LD	$t0,2*$BNSZ($a1)
	$MULTU	$t0,$t0
	mflo	$t1
	mfhi	$t0
	$ST	$t1,4*$BNSZ($a0)
	$ST	$t0,5*$BNSZ($a0)

.L_bn_sqr_words_return:
	.set	noreorder
___
# NUBI epilogue.
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$t3,4*$SZREG($sp)
	$REG_L	$t2,3*$SZREG($sp)
	$REG_L	$t1,2*$SZREG($sp)
	$REG_L	$t0,1*$SZREG($sp)
	$REG_L	$gp,0*$SZREG($sp)
	$PTR_ADD $sp,6*$SZREG
___
# Return, then the guarded public entry for bn_add_words.  Note the
# word count for add/sub lives in $a3 (r, a, b occupy $a0..$a2).
$code.=<<___;
	jr	$ra
	move	$a0,$v0

.end	bn_sqr_words_internal

.align	5
.globl	bn_add_words
.ent	bn_add_words
bn_add_words:
	.set	noreorder
	bgtz	$a3,bn_add_words_internal
	move	$v0,$zero
	jr	$ra
	move	$a0,$v0
.end	bn_add_words

.align	5
.ent	bn_add_words_internal
bn_add_words_internal:
___
$code.=<<___ if ($flavour =~ /nubi/i);
	.frame	$sp,6*$SZREG,$ra
	.mask	0x8000f008,-$SZREG
	.set	noreorder
	$PTR_SUB $sp,6*$SZREG
	$REG_S	$ra,5*$SZREG($sp)
	$REG_S	$t3,4*$SZREG($sp)
	$REG_S	$t2,3*$SZREG($sp)
	$REG_S	$t1,2*$SZREG($sp)
	$REG_S	$t0,1*$SZREG($sp)
	$REG_S	$gp,0*$SZREG($sp)
___
# bn_add_words_internal: r[i] = a[i] + b[i] + carry, returning the
# final carry in $v0.  Args: $a0=r, $a1=a, $a2=b, $a3=num (>0).  Each
# word's carry is detected via two unsigned compares (sltu) because
# MIPS has no carry flag.
$code.=<<___;
	.set	reorder
	li	$minus4,-4
	and	$at,$a3,$minus4
	beqz	$at,.L_bn_add_words_tail

.L_bn_add_words_loop:
	$LD	$t0,0($a1)
	$LD	$ta0,0($a2)
	subu	$a3,4
	$LD	$t1,$BNSZ($a1)
	and	$at,$a3,$minus4
	$LD	$t2,2*$BNSZ($a1)
	$PTR_ADD $a2,4*$BNSZ
	$LD	$t3,3*$BNSZ($a1)
	$PTR_ADD $a0,4*$BNSZ
	$LD	$ta1,-3*$BNSZ($a2)
	$PTR_ADD $a1,4*$BNSZ
	$LD	$ta2,-2*$BNSZ($a2)
	$LD	$ta3,-$BNSZ($a2)
	$ADDU	$ta0,$t0
	sltu	$t8,$ta0,$t0
	$ADDU	$t0,$ta0,$v0
	sltu	$v0,$t0,$ta0
	$ST	$t0,-4*$BNSZ($a0)
	$ADDU	$v0,$t8

	$ADDU	$ta1,$t1
	sltu	$t9,$ta1,$t1
	$ADDU	$t1,$ta1,$v0
	sltu	$v0,$t1,$ta1
	$ST	$t1,-3*$BNSZ($a0)
	$ADDU	$v0,$t9

	$ADDU	$ta2,$t2
	sltu	$t8,$ta2,$t2
	$ADDU	$t2,$ta2,$v0
	sltu	$v0,$t2,$ta2
	$ST	$t2,-2*$BNSZ($a0)
	$ADDU	$v0,$t8

	$ADDU	$ta3,$t3
	sltu	$t9,$ta3,$t3
	$ADDU	$t3,$ta3,$v0
	sltu	$v0,$t3,$ta3
	$ST	$t3,-$BNSZ($a0)

	.set	noreorder
	bgtz	$at,.L_bn_add_words_loop
	$ADDU	$v0,$t9

	beqz	$a3,.L_bn_add_words_return
	nop

.L_bn_add_words_tail:
	.set	reorder
	$LD	$t0,0($a1)
	$LD	$ta0,0($a2)
	$ADDU	$ta0,$t0
	subu	$a3,1
	sltu	$t8,$ta0,$t0
	$ADDU	$t0,$ta0,$v0
	sltu	$v0,$t0,$ta0
	$ST	$t0,0($a0)
	$ADDU	$v0,$t8
	beqz	$a3,.L_bn_add_words_return

	$LD	$t1,$BNSZ($a1)
	$LD	$ta1,$BNSZ($a2)
	$ADDU	$ta1,$t1
	subu	$a3,1
	sltu	$t9,$ta1,$t1
	$ADDU	$t1,$ta1,$v0
	sltu	$v0,$t1,$ta1
	$ST	$t1,$BNSZ($a0)
	$ADDU	$v0,$t9
	beqz	$a3,.L_bn_add_words_return

	$LD	$t2,2*$BNSZ($a1)
	$LD	$ta2,2*$BNSZ($a2)
	$ADDU	$ta2,$t2
	sltu	$t8,$ta2,$t2
	$ADDU	$t2,$ta2,$v0
	sltu	$v0,$t2,$ta2
	$ST	$t2,2*$BNSZ($a0)
	$ADDU	$v0,$t8

.L_bn_add_words_return:
	.set	noreorder
___
# NUBI epilogue.
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$t3,4*$SZREG($sp)
	$REG_L	$t2,3*$SZREG($sp)
	$REG_L	$t1,2*$SZREG($sp)
	$REG_L	$t0,1*$SZREG($sp)
	$REG_L	$gp,0*$SZREG($sp)
	$PTR_ADD $sp,6*$SZREG
___
# Return, then the guarded public entry for bn_sub_words.
$code.=<<___;
	jr	$ra
	move	$a0,$v0

.end	bn_add_words_internal

.align	5
.globl	bn_sub_words
.ent	bn_sub_words
bn_sub_words:
	.set	noreorder
	bgtz	$a3,bn_sub_words_internal
	move	$v0,$zero
	jr	$ra
	move	$a0,$zero
.end	bn_sub_words

.align	5
.ent	bn_sub_words_internal
bn_sub_words_internal:
___
$code.=<<___ if ($flavour =~ /nubi/i);
	.frame	$sp,6*$SZREG,$ra
	.mask	0x8000f008,-$SZREG
	.set	noreorder
	$PTR_SUB $sp,6*$SZREG
	$REG_S	$ra,5*$SZREG($sp)
	$REG_S	$t3,4*$SZREG($sp)
	$REG_S	$t2,3*$SZREG($sp)
	$REG_S	$t1,2*$SZREG($sp)
	$REG_S	$t0,1*$SZREG($sp)
	$REG_S	$gp,0*$SZREG($sp)
___
# bn_sub_words_internal: r[i] = a[i] - b[i] - borrow, returning the
# final borrow in $v0.  Args: $a0=r, $a1=a, $a2=b, $a3=num (>0).
# Borrow-out of each word is collected with sltu/sgtu compares.
$code.=<<___;
	.set	reorder
	li	$minus4,-4
	and	$at,$a3,$minus4
	beqz	$at,.L_bn_sub_words_tail

.L_bn_sub_words_loop:
	$LD	$t0,0($a1)
	$LD	$ta0,0($a2)
	subu	$a3,4
	$LD	$t1,$BNSZ($a1)
	and	$at,$a3,$minus4
	$LD	$t2,2*$BNSZ($a1)
	$PTR_ADD $a2,4*$BNSZ
	$LD	$t3,3*$BNSZ($a1)
	$PTR_ADD $a0,4*$BNSZ
	$LD	$ta1,-3*$BNSZ($a2)
	$PTR_ADD $a1,4*$BNSZ
	$LD	$ta2,-2*$BNSZ($a2)
	$LD	$ta3,-$BNSZ($a2)
	sltu	$t8,$t0,$ta0
	$SUBU	$ta0,$t0,$ta0
	$SUBU	$t0,$ta0,$v0
	sgtu	$v0,$t0,$ta0
	$ST	$t0,-4*$BNSZ($a0)
	$ADDU	$v0,$t8

	sltu	$t9,$t1,$ta1
	$SUBU	$ta1,$t1,$ta1
	$SUBU	$t1,$ta1,$v0
	sgtu	$v0,$t1,$ta1
	$ST	$t1,-3*$BNSZ($a0)
	$ADDU	$v0,$t9

	sltu	$t8,$t2,$ta2
	$SUBU	$ta2,$t2,$ta2
	$SUBU	$t2,$ta2,$v0
	sgtu	$v0,$t2,$ta2
	$ST	$t2,-2*$BNSZ($a0)
	$ADDU	$v0,$t8

	sltu	$t9,$t3,$ta3
	$SUBU	$ta3,$t3,$ta3
	$SUBU	$t3,$ta3,$v0
	sgtu	$v0,$t3,$ta3
	$ST	$t3,-$BNSZ($a0)

	.set	noreorder
	bgtz	$at,.L_bn_sub_words_loop
	$ADDU	$v0,$t9

	beqz	$a3,.L_bn_sub_words_return
	nop

.L_bn_sub_words_tail:
	.set	reorder
	$LD	$t0,0($a1)
	$LD	$ta0,0($a2)
	subu	$a3,1
	sltu	$t8,$t0,$ta0
	$SUBU	$ta0,$t0,$ta0
	$SUBU	$t0,$ta0,$v0
	sgtu	$v0,$t0,$ta0
	$ST	$t0,0($a0)
	$ADDU	$v0,$t8
	beqz	$a3,.L_bn_sub_words_return

	$LD	$t1,$BNSZ($a1)
	subu	$a3,1
	$LD	$ta1,$BNSZ($a2)
	sltu	$t9,$t1,$ta1
	$SUBU	$ta1,$t1,$ta1
	$SUBU	$t1,$ta1,$v0
	sgtu	$v0,$t1,$ta1
	$ST	$t1,$BNSZ($a0)
	$ADDU	$v0,$t9
	beqz	$a3,.L_bn_sub_words_return

	$LD	$t2,2*$BNSZ($a1)
	$LD	$ta2,2*$BNSZ($a2)
	sltu	$t8,$t2,$ta2
	$SUBU	$ta2,$t2,$ta2
	$SUBU	$t2,$ta2,$v0
	sgtu	$v0,$t2,$ta2
	$ST	$t2,2*$BNSZ($a0)
	$ADDU	$v0,$t8

.L_bn_sub_words_return:
	.set	noreorder
___
# NUBI epilogue.
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$t3,4*$SZREG($sp)
	$REG_L	$t2,3*$SZREG($sp)
	$REG_L	$t1,2*$SZREG($sp)
	$REG_L	$t0,1*$SZREG($sp)
	$REG_L	$gp,0*$SZREG($sp)
	$PTR_ADD $sp,6*$SZREG
___
# Return, then bn_div_3_words: its wrapper rearranges the three input
# words into registers that bn_div_words leaves alone (see comment)
# and bails out with -1 when the top word equals the divisor word.
$code.=<<___;
	jr	$ra
	move	$a0,$v0
.end	bn_sub_words_internal

.align	5
.globl	bn_div_3_words
.ent	bn_div_3_words
bn_div_3_words:
	.set	noreorder
	move	$a3,$a0		# we know that bn_div_words does not
				# touch $a3, $ta2, $ta3 and preserves $a2
				# so that we can save two arguments
				# and return address in registers
				# instead of stack:-)

	$LD	$a0,($a3)
	move	$ta2,$a1
	bne	$a0,$a2,bn_div_3_words_internal
	$LD	$a1,-$BNSZ($a3)
	li	$v0,-1
	jr	$ra
	move	$a0,$v0
.end	bn_div_3_words

.align	5
.ent	bn_div_3_words_internal
bn_div_3_words_internal:
___
$code.=<<___ if ($flavour =~ /nubi/i);
	.frame	$sp,6*$SZREG,$ra
	.mask	0x8000f008,-$SZREG
	.set	noreorder
	$PTR_SUB $sp,6*$SZREG
	$REG_S	$ra,5*$SZREG($sp)
	$REG_S	$t3,4*$SZREG($sp)
	$REG_S	$t2,3*$SZREG($sp)
	$REG_S	$t1,2*$SZREG($sp)
	$REG_S	$t0,1*$SZREG($sp)
	$REG_S	$gp,0*$SZREG($sp)
___
# bn_div_3_words_internal: first obtains a quotient estimate via
# bn_div_words_internal (called with bal, return address parked in
# $ta3 since the wrapper keeps everything in registers), then the
# inner loop corrects the estimate downward ($SUBU $v0,1 runs in the
# branch delay slot on every pass; the $ADDU $v0,1 on the fall-through
# path compensates for the final extra decrement).
$code.=<<___;
	.set	reorder
	move	$ta3,$ra
	bal	bn_div_words_internal
	move	$ra,$ta3
	$MULTU	$ta2,$v0
	$LD	$t2,-2*$BNSZ($a3)
	move	$ta0,$zero
	mfhi	$t1
	mflo	$t0
	sltu	$t8,$t1,$a1
.L_bn_div_3_words_inner_loop:
	bnez	$t8,.L_bn_div_3_words_inner_loop_done
	sgeu	$at,$t2,$t0
	seq	$t9,$t1,$a1
	and	$at,$t9
	sltu	$t3,$t0,$ta2
	$ADDU	$a1,$a2
	$SUBU	$t1,$t3
	$SUBU	$t0,$ta2
	sltu	$t8,$t1,$a1
	sltu	$ta0,$a1,$a2
	or	$t8,$ta0
	.set	noreorder
	beqz	$at,.L_bn_div_3_words_inner_loop
	$SUBU	$v0,1
	$ADDU	$v0,1
	.set	reorder
.L_bn_div_3_words_inner_loop_done:
	.set	noreorder
___
# NUBI epilogue.
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$t3,4*$SZREG($sp)
	$REG_L	$t2,3*$SZREG($sp)
	$REG_L	$t1,2*$SZREG($sp)
	$REG_L	$t0,1*$SZREG($sp)
	$REG_L	$gp,0*$SZREG($sp)
	$PTR_ADD $sp,6*$SZREG
___
# Return, then the guarded public entry for bn_div_words ($a2 is the
# divisor; zero divisor yields -1 rather than trapping).
$code.=<<___;
	jr	$ra
	move	$a0,$v0
.end	bn_div_3_words_internal

.align	5
.globl	bn_div_words
.ent	bn_div_words
bn_div_words:
	.set	noreorder
	bnez	$a2,bn_div_words_internal
	li	$v0,-1		# I would rather signal div-by-zero
				# which can be done with 'break 7'
	jr	$ra
	move	$a0,$v0
.end	bn_div_words

.align	5
.ent	bn_div_words_internal
bn_div_words_internal:
___
$code.=<<___ if ($flavour =~ /nubi/i);
	.frame	$sp,6*$SZREG,$ra
	.mask	0x8000f008,-$SZREG
	.set	noreorder
	$PTR_SUB $sp,6*$SZREG
	$REG_S	$ra,5*$SZREG($sp)
	$REG_S	$t3,4*$SZREG($sp)
	$REG_S	$t2,3*$SZREG($sp)
	$REG_S	$t1,2*$SZREG($sp)
	$REG_S	$t0,1*$SZREG($sp)
	$REG_S	$gp,0*$SZREG($sp)
___
# bn_div_words_internal: quotient of the double word ($a0:$a1) by $a2,
# returned in $v0.  The divisor is first normalized — shifted left
# until its MSB is set, shift count kept in $t9 — with the dividend
# shifted to match (a 'break 6' traps if high dividend bits would be
# lost, i.e. overflow).  The quotient is then formed a half-word at a
# time: divide the high part by the divisor's top half for an
# estimate, multiply back, and correct in an inner loop.
$code.=<<___;
	move	$v1,$zero
	bltz	$a2,.L_bn_div_words_body	# divisor already normalized?
	move	$t9,$v1
	$SLL	$a2,1
	bgtz	$a2,.-4
	addu	$t9,1

	.set	reorder
	negu	$t1,$t9
	li	$t2,-1
	$SLL	$t2,$t1
	and	$t2,$a0
	$SRL	$at,$a1,$t1
	.set	noreorder
	beqz	$t2,.+12
	nop
	break	6		# signal overflow
	.set	reorder
	$SLL	$a0,$t9
	$SLL	$a1,$t9
	or	$a0,$at
___
# Register roles for the division body: quotient estimate, high half
# of the running remainder, high half of the (normalized) divisor.
$QT=$ta0;
$HH=$ta1;
$DH=$v1;
$code.=<<___;
.L_bn_div_words_body:
	$SRL	$DH,$a2,4*$BNSZ	# bits
	sgeu	$at,$a0,$a2
	.set	noreorder
	beqz	$at,.+12
	nop
	$SUBU	$a0,$a2
	.set	reorder

	li	$QT,-1
	$SRL	$HH,$a0,4*$BNSZ	# bits
	$SRL	$QT,4*$BNSZ	# q=0xffffffff
	beq	$DH,$HH,.L_bn_div_words_skip_div1
	$DIVU	$zero,$a0,$DH
	mflo	$QT
.L_bn_div_words_skip_div1:
	$MULTU	$a2,$QT
	$SLL	$t3,$a0,4*$BNSZ	# bits
	$SRL	$at,$a1,4*$BNSZ	# bits
	or	$t3,$at
	mflo	$t0
	mfhi	$t1
.L_bn_div_words_inner_loop1:
	sltu	$t2,$t3,$t0
	seq	$t8,$HH,$t1
	sltu	$at,$HH,$t1
	and	$t2,$t8
	sltu	$v0,$t0,$a2
	or	$at,$t2
	.set	noreorder
	beqz	$at,.L_bn_div_words_inner_loop1_done
	$SUBU	$t1,$v0
	$SUBU	$t0,$a2
	b	.L_bn_div_words_inner_loop1
	$SUBU	$QT,1
	.set	reorder
.L_bn_div_words_inner_loop1_done:

	$SLL	$a1,4*$BNSZ	# bits
	$SUBU	$a0,$t3,$t0
	$SLL	$v0,$QT,4*$BNSZ	# bits

	li	$QT,-1
	$SRL	$HH,$a0,4*$BNSZ	# bits
	$SRL	$QT,4*$BNSZ	# q=0xffffffff
	beq	$DH,$HH,.L_bn_div_words_skip_div2
	$DIVU	$zero,$a0,$DH
	mflo	$QT
.L_bn_div_words_skip_div2:
	$MULTU	$a2,$QT
	$SLL	$t3,$a0,4*$BNSZ	# bits
	$SRL	$at,$a1,4*$BNSZ	# bits
	or	$t3,$at
	mflo	$t0
	mfhi	$t1
.L_bn_div_words_inner_loop2:
	sltu	$t2,$t3,$t0
	seq	$t8,$HH,$t1
	sltu	$at,$HH,$t1
	and	$t2,$t8
	sltu	$v1,$t0,$a2
	or	$at,$t2
	.set	noreorder
	beqz	$at,.L_bn_div_words_inner_loop2_done
	$SUBU	$t1,$v1
	$SUBU	$t0,$a2
	b	.L_bn_div_words_inner_loop2
	$SUBU	$QT,1
	.set	reorder
.L_bn_div_words_inner_loop2_done:

	$SUBU	$a0,$t3,$t0
	or	$v0,$QT
	$SRL	$v1,$a0,$t9	# $v1 contains remainder if anybody wants it
	$SRL	$a2,$t9	# restore $a2

	.set	noreorder
	move	$a1,$v1
___
# NUBI epilogue.
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$t3,4*$SZREG($sp)
	$REG_L	$t2,3*$SZREG($sp)
	$REG_L	$t1,2*$SZREG($sp)
	$REG_L	$t0,1*$SZREG($sp)
	$REG_L	$gp,0*$SZREG($sp)
	$PTR_ADD $sp,6*$SZREG
___
$code.=<<___;
	jr	$ra
	move	$a0,$v0
.end	bn_div_words_internal
___
# Division-only aliases are finished with; release them before the
# comba (bn_mul_comba*/bn_sqr_comba*) section reuses the registers.
undef $HH; undef $QT; undef $DH;

# Low input words a[0..3]/b[0..3] take the temporaries.
$a_0 = $t0;  $a_1 = $t1;  $a_2 = $t2;  $a_3 = $t3;
$b_0 = $ta0; $b_1 = $ta1; $b_2 = $ta2; $b_3 = $ta3;

# High words take the saved registers plus the argument registers that
# become free after their last use.
$a_4 = $s0; $a_5 = $s2; $a_6 = $s4; $a_7 = $a1;	# once we load a[7], no use for $a1
$b_4 = $s1; $b_5 = $s3; $b_6 = $s5; $b_7 = $a2;	# once we load b[7], no use for $a2

# Product halves and the three rotating comba accumulators.
($t_1,$t_2)       = ($t8,$t9);
($c_1,$c_2,$c_3)  = ($v0,$v1,$a3);
1015 $code.=<<___;
1017 .align 5
1018 .globl bn_mul_comba8
1019 .ent bn_mul_comba8
1020 bn_mul_comba8:
1021 .set noreorder
1023 $code.=<<___ if ($flavour =~ /nubi/i);
1024 .frame $sp,12*$SZREG,$ra
1025 .mask 0x803ff008,-$SZREG
1026 $PTR_SUB $sp,12*$SZREG
1027 $REG_S $ra,11*$SZREG($sp)
1028 $REG_S $s5,10*$SZREG($sp)
1029 $REG_S $s4,9*$SZREG($sp)
1030 $REG_S $s3,8*$SZREG($sp)
1031 $REG_S $s2,7*$SZREG($sp)
1032 $REG_S $s1,6*$SZREG($sp)
1033 $REG_S $s0,5*$SZREG($sp)
1034 $REG_S $t3,4*$SZREG($sp)
1035 $REG_S $t2,3*$SZREG($sp)
1036 $REG_S $t1,2*$SZREG($sp)
1037 $REG_S $t0,1*$SZREG($sp)
1038 $REG_S $gp,0*$SZREG($sp)
1040 $code.=<<___ if ($flavour !~ /nubi/i);
1041 .frame $sp,6*$SZREG,$ra
1042 .mask 0x003f0000,-$SZREG
1043 $PTR_SUB $sp,6*$SZREG
1044 $REG_S $s5,5*$SZREG($sp)
1045 $REG_S $s4,4*$SZREG($sp)
1046 $REG_S $s3,3*$SZREG($sp)
1047 $REG_S $s2,2*$SZREG($sp)
1048 $REG_S $s1,1*$SZREG($sp)
1049 $REG_S $s0,0*$SZREG($sp)
1051 $code.=<<___;
1053 .set reorder
1054 $LD $a_0,0($a1) # If compiled with -mips3 option on
1055 # R5000 box assembler barks on this
1056 # 1ine with "should not have mult/div
1057 # as last instruction in bb (R10K
1058 # bug)" warning. If anybody out there
1059 # has a clue about how to circumvent
1060 # this do send me a note.
1061 # <appro\@fy.chalmers.se>
1063 $LD $b_0,0($a2)
1064 $LD $a_1,$BNSZ($a1)
1065 $LD $a_2,2*$BNSZ($a1)
1066 $MULTU $a_0,$b_0 # mul_add_c(a[0],b[0],c1,c2,c3);
1067 $LD $a_3,3*$BNSZ($a1)
1068 $LD $b_1,$BNSZ($a2)
1069 $LD $b_2,2*$BNSZ($a2)
1070 $LD $b_3,3*$BNSZ($a2)
1071 mflo $c_1
1072 mfhi $c_2
1074 $LD $a_4,4*$BNSZ($a1)
1075 $LD $a_5,5*$BNSZ($a1)
1076 $MULTU $a_0,$b_1 # mul_add_c(a[0],b[1],c2,c3,c1);
1077 $LD $a_6,6*$BNSZ($a1)
1078 $LD $a_7,7*$BNSZ($a1)
1079 $LD $b_4,4*$BNSZ($a2)
1080 $LD $b_5,5*$BNSZ($a2)
1081 mflo $t_1
1082 mfhi $t_2
1083 $ADDU $c_2,$t_1
1084 sltu $at,$c_2,$t_1
1085 $MULTU $a_1,$b_0 # mul_add_c(a[1],b[0],c2,c3,c1);
1086 $ADDU $c_3,$t_2,$at
1087 $LD $b_6,6*$BNSZ($a2)
1088 $LD $b_7,7*$BNSZ($a2)
1089 $ST $c_1,0($a0) # r[0]=c1;
1090 mflo $t_1
1091 mfhi $t_2
1092 $ADDU $c_2,$t_1
1093 sltu $at,$c_2,$t_1
1094 $MULTU $a_2,$b_0 # mul_add_c(a[2],b[0],c3,c1,c2);
1095 $ADDU $t_2,$at
1096 $ADDU $c_3,$t_2
1097 sltu $c_1,$c_3,$t_2
1098 $ST $c_2,$BNSZ($a0) # r[1]=c2;
1100 mflo $t_1
1101 mfhi $t_2
1102 $ADDU $c_3,$t_1
1103 sltu $at,$c_3,$t_1
1104 $MULTU $a_1,$b_1 # mul_add_c(a[1],b[1],c3,c1,c2);
1105 $ADDU $t_2,$at
1106 $ADDU $c_1,$t_2
1107 mflo $t_1
1108 mfhi $t_2
1109 $ADDU $c_3,$t_1
1110 sltu $at,$c_3,$t_1
1111 $MULTU $a_0,$b_2 # mul_add_c(a[0],b[2],c3,c1,c2);
1112 $ADDU $t_2,$at
1113 $ADDU $c_1,$t_2
1114 sltu $c_2,$c_1,$t_2
1115 mflo $t_1
1116 mfhi $t_2
1117 $ADDU $c_3,$t_1
1118 sltu $at,$c_3,$t_1
1119 $MULTU $a_0,$b_3 # mul_add_c(a[0],b[3],c1,c2,c3);
1120 $ADDU $t_2,$at
1121 $ADDU $c_1,$t_2
1122 sltu $at,$c_1,$t_2
1123 $ADDU $c_2,$at
1124 $ST $c_3,2*$BNSZ($a0) # r[2]=c3;
1126 mflo $t_1
1127 mfhi $t_2
1128 $ADDU $c_1,$t_1
1129 sltu $at,$c_1,$t_1
1130 $MULTU $a_1,$b_2 # mul_add_c(a[1],b[2],c1,c2,c3);
1131 $ADDU $t_2,$at
1132 $ADDU $c_2,$t_2
1133 sltu $c_3,$c_2,$t_2
1134 mflo $t_1
1135 mfhi $t_2
1136 $ADDU $c_1,$t_1
1137 sltu $at,$c_1,$t_1
1138 $MULTU $a_2,$b_1 # mul_add_c(a[2],b[1],c1,c2,c3);
1139 $ADDU $t_2,$at
1140 $ADDU $c_2,$t_2
1141 sltu $at,$c_2,$t_2
1142 $ADDU $c_3,$at
1143 mflo $t_1
1144 mfhi $t_2
1145 $ADDU $c_1,$t_1
1146 sltu $at,$c_1,$t_1
1147 $MULTU $a_3,$b_0 # mul_add_c(a[3],b[0],c1,c2,c3);
1148 $ADDU $t_2,$at
1149 $ADDU $c_2,$t_2
1150 sltu $at,$c_2,$t_2
1151 $ADDU $c_3,$at
1152 mflo $t_1
1153 mfhi $t_2
1154 $ADDU $c_1,$t_1
1155 sltu $at,$c_1,$t_1
1156 $MULTU $a_4,$b_0 # mul_add_c(a[4],b[0],c2,c3,c1);
1157 $ADDU $t_2,$at
1158 $ADDU $c_2,$t_2
1159 sltu $at,$c_2,$t_2
1160 $ADDU $c_3,$at
1161 $ST $c_1,3*$BNSZ($a0) # r[3]=c1;
1163 mflo $t_1
1164 mfhi $t_2
1165 $ADDU $c_2,$t_1
1166 sltu $at,$c_2,$t_1
1167 $MULTU $a_3,$b_1 # mul_add_c(a[3],b[1],c2,c3,c1);
1168 $ADDU $t_2,$at
1169 $ADDU $c_3,$t_2
1170 sltu $c_1,$c_3,$t_2
1171 mflo $t_1
1172 mfhi $t_2
1173 $ADDU $c_2,$t_1
1174 sltu $at,$c_2,$t_1
1175 $MULTU $a_2,$b_2 # mul_add_c(a[2],b[2],c2,c3,c1);
1176 $ADDU $t_2,$at
1177 $ADDU $c_3,$t_2
1178 sltu $at,$c_3,$t_2
1179 $ADDU $c_1,$at
1180 mflo $t_1
1181 mfhi $t_2
1182 $ADDU $c_2,$t_1
1183 sltu $at,$c_2,$t_1
1184 $MULTU $a_1,$b_3 # mul_add_c(a[1],b[3],c2,c3,c1);
1185 $ADDU $t_2,$at
1186 $ADDU $c_3,$t_2
1187 sltu $at,$c_3,$t_2
1188 $ADDU $c_1,$at
1189 mflo $t_1
1190 mfhi $t_2
1191 $ADDU $c_2,$t_1
1192 sltu $at,$c_2,$t_1
1193 $MULTU $a_0,$b_4 # mul_add_c(a[0],b[4],c2,c3,c1);
1194 $ADDU $t_2,$at
1195 $ADDU $c_3,$t_2
1196 sltu $at,$c_3,$t_2
1197 $ADDU $c_1,$at
1198 mflo $t_1
1199 mfhi $t_2
1200 $ADDU $c_2,$t_1
1201 sltu $at,$c_2,$t_1
1202 $MULTU $a_0,$b_5 # mul_add_c(a[0],b[5],c3,c1,c2);
1203 $ADDU $t_2,$at
1204 $ADDU $c_3,$t_2
1205 sltu $at,$c_3,$t_2
1206 $ADDU $c_1,$at
1207 $ST $c_2,4*$BNSZ($a0) # r[4]=c2;
1209 mflo $t_1
1210 mfhi $t_2
1211 $ADDU $c_3,$t_1
1212 sltu $at,$c_3,$t_1
1213 $MULTU $a_1,$b_4 # mul_add_c(a[1],b[4],c3,c1,c2);
1214 $ADDU $t_2,$at
1215 $ADDU $c_1,$t_2
1216 sltu $c_2,$c_1,$t_2
1217 mflo $t_1
1218 mfhi $t_2
1219 $ADDU $c_3,$t_1
1220 sltu $at,$c_3,$t_1
1221 $MULTU $a_2,$b_3 # mul_add_c(a[2],b[3],c3,c1,c2);
1222 $ADDU $t_2,$at
1223 $ADDU $c_1,$t_2
1224 sltu $at,$c_1,$t_2
1225 $ADDU $c_2,$at
1226 mflo $t_1
1227 mfhi $t_2
1228 $ADDU $c_3,$t_1
1229 sltu $at,$c_3,$t_1
1230 $MULTU $a_3,$b_2 # mul_add_c(a[3],b[2],c3,c1,c2);
1231 $ADDU $t_2,$at
1232 $ADDU $c_1,$t_2
1233 sltu $at,$c_1,$t_2
1234 $ADDU $c_2,$at
1235 mflo $t_1
1236 mfhi $t_2
1237 $ADDU $c_3,$t_1
1238 sltu $at,$c_3,$t_1
1239 $MULTU $a_4,$b_1 # mul_add_c(a[4],b[1],c3,c1,c2);
1240 $ADDU $t_2,$at
1241 $ADDU $c_1,$t_2
1242 sltu $at,$c_1,$t_2
1243 $ADDU $c_2,$at
1244 mflo $t_1
1245 mfhi $t_2
1246 $ADDU $c_3,$t_1
1247 sltu $at,$c_3,$t_1
1248 $MULTU $a_5,$b_0 # mul_add_c(a[5],b[0],c3,c1,c2);
1249 $ADDU $t_2,$at
1250 $ADDU $c_1,$t_2
1251 sltu $at,$c_1,$t_2
1252 $ADDU $c_2,$at
1253 mflo $t_1
1254 mfhi $t_2
1255 $ADDU $c_3,$t_1
1256 sltu $at,$c_3,$t_1
1257 $MULTU $a_6,$b_0 # mul_add_c(a[6],b[0],c1,c2,c3);
1258 $ADDU $t_2,$at
1259 $ADDU $c_1,$t_2
1260 sltu $at,$c_1,$t_2
1261 $ADDU $c_2,$at
1262 $ST $c_3,5*$BNSZ($a0) # r[5]=c3;
1264 mflo $t_1
1265 mfhi $t_2
1266 $ADDU $c_1,$t_1
1267 sltu $at,$c_1,$t_1
1268 $MULTU $a_5,$b_1 # mul_add_c(a[5],b[1],c1,c2,c3);
1269 $ADDU $t_2,$at
1270 $ADDU $c_2,$t_2
1271 sltu $c_3,$c_2,$t_2
1272 mflo $t_1
1273 mfhi $t_2
1274 $ADDU $c_1,$t_1
1275 sltu $at,$c_1,$t_1
1276 $MULTU $a_4,$b_2 # mul_add_c(a[4],b[2],c1,c2,c3);
1277 $ADDU $t_2,$at
1278 $ADDU $c_2,$t_2
1279 sltu $at,$c_2,$t_2
1280 $ADDU $c_3,$at
1281 mflo $t_1
1282 mfhi $t_2
1283 $ADDU $c_1,$t_1
1284 sltu $at,$c_1,$t_1
1285 $MULTU $a_3,$b_3 # mul_add_c(a[3],b[3],c1,c2,c3);
1286 $ADDU $t_2,$at
1287 $ADDU $c_2,$t_2
1288 sltu $at,$c_2,$t_2
1289 $ADDU $c_3,$at
1290 mflo $t_1
1291 mfhi $t_2
1292 $ADDU $c_1,$t_1
1293 sltu $at,$c_1,$t_1
1294 $MULTU $a_2,$b_4 # mul_add_c(a[2],b[4],c1,c2,c3);
1295 $ADDU $t_2,$at
1296 $ADDU $c_2,$t_2
1297 sltu $at,$c_2,$t_2
1298 $ADDU $c_3,$at
1299 mflo $t_1
1300 mfhi $t_2
1301 $ADDU $c_1,$t_1
1302 sltu $at,$c_1,$t_1
1303 $MULTU $a_1,$b_5 # mul_add_c(a[1],b[5],c1,c2,c3);
1304 $ADDU $t_2,$at
1305 $ADDU $c_2,$t_2
1306 sltu $at,$c_2,$t_2
1307 $ADDU $c_3,$at
1308 mflo $t_1
1309 mfhi $t_2
1310 $ADDU $c_1,$t_1
1311 sltu $at,$c_1,$t_1
1312 $MULTU $a_0,$b_6 # mul_add_c(a[0],b[6],c1,c2,c3);
1313 $ADDU $t_2,$at
1314 $ADDU $c_2,$t_2
1315 sltu $at,$c_2,$t_2
1316 $ADDU $c_3,$at
1317 mflo $t_1
1318 mfhi $t_2
1319 $ADDU $c_1,$t_1
1320 sltu $at,$c_1,$t_1
1321 $MULTU $a_0,$b_7 # mul_add_c(a[0],b[7],c2,c3,c1);
1322 $ADDU $t_2,$at
1323 $ADDU $c_2,$t_2
1324 sltu $at,$c_2,$t_2
1325 $ADDU $c_3,$at
1326 $ST $c_1,6*$BNSZ($a0) # r[6]=c1;
1328 mflo $t_1
1329 mfhi $t_2
1330 $ADDU $c_2,$t_1
1331 sltu $at,$c_2,$t_1
1332 $MULTU $a_1,$b_6 # mul_add_c(a[1],b[6],c2,c3,c1);
1333 $ADDU $t_2,$at
1334 $ADDU $c_3,$t_2
1335 sltu $c_1,$c_3,$t_2
1336 mflo $t_1
1337 mfhi $t_2
1338 $ADDU $c_2,$t_1
1339 sltu $at,$c_2,$t_1
1340 $MULTU $a_2,$b_5 # mul_add_c(a[2],b[5],c2,c3,c1);
1341 $ADDU $t_2,$at
1342 $ADDU $c_3,$t_2
1343 sltu $at,$c_3,$t_2
1344 $ADDU $c_1,$at
1345 mflo $t_1
1346 mfhi $t_2
1347 $ADDU $c_2,$t_1
1348 sltu $at,$c_2,$t_1
1349 $MULTU $a_3,$b_4 # mul_add_c(a[3],b[4],c2,c3,c1);
1350 $ADDU $t_2,$at
1351 $ADDU $c_3,$t_2
1352 sltu $at,$c_3,$t_2
1353 $ADDU $c_1,$at
1354 mflo $t_1
1355 mfhi $t_2
1356 $ADDU $c_2,$t_1
1357 sltu $at,$c_2,$t_1
1358 $MULTU $a_4,$b_3 # mul_add_c(a[4],b[3],c2,c3,c1);
1359 $ADDU $t_2,$at
1360 $ADDU $c_3,$t_2
1361 sltu $at,$c_3,$t_2
1362 $ADDU $c_1,$at
1363 mflo $t_1
1364 mfhi $t_2
1365 $ADDU $c_2,$t_1
1366 sltu $at,$c_2,$t_1
1367 $MULTU $a_5,$b_2 # mul_add_c(a[5],b[2],c2,c3,c1);
1368 $ADDU $t_2,$at
1369 $ADDU $c_3,$t_2
1370 sltu $at,$c_3,$t_2
1371 $ADDU $c_1,$at
1372 mflo $t_1
1373 mfhi $t_2
1374 $ADDU $c_2,$t_1
1375 sltu $at,$c_2,$t_1
1376 $MULTU $a_6,$b_1 # mul_add_c(a[6],b[1],c2,c3,c1);
1377 $ADDU $t_2,$at
1378 $ADDU $c_3,$t_2
1379 sltu $at,$c_3,$t_2
1380 $ADDU $c_1,$at
1381 mflo $t_1
1382 mfhi $t_2
1383 $ADDU $c_2,$t_1
1384 sltu $at,$c_2,$t_1
1385 $MULTU $a_7,$b_0 # mul_add_c(a[7],b[0],c2,c3,c1);
1386 $ADDU $t_2,$at
1387 $ADDU $c_3,$t_2
1388 sltu $at,$c_3,$t_2
1389 $ADDU $c_1,$at
1390 mflo $t_1
1391 mfhi $t_2
1392 $ADDU $c_2,$t_1
1393 sltu $at,$c_2,$t_1
1394 $MULTU $a_7,$b_1 # mul_add_c(a[7],b[1],c3,c1,c2);
1395 $ADDU $t_2,$at
1396 $ADDU $c_3,$t_2
1397 sltu $at,$c_3,$t_2
1398 $ADDU $c_1,$at
1399 $ST $c_2,7*$BNSZ($a0) # r[7]=c2;
1401 mflo $t_1
1402 mfhi $t_2
1403 $ADDU $c_3,$t_1
1404 sltu $at,$c_3,$t_1
1405 $MULTU $a_6,$b_2 # mul_add_c(a[6],b[2],c3,c1,c2);
1406 $ADDU $t_2,$at
1407 $ADDU $c_1,$t_2
1408 sltu $c_2,$c_1,$t_2
1409 mflo $t_1
1410 mfhi $t_2
1411 $ADDU $c_3,$t_1
1412 sltu $at,$c_3,$t_1
1413 $MULTU $a_5,$b_3 # mul_add_c(a[5],b[3],c3,c1,c2);
1414 $ADDU $t_2,$at
1415 $ADDU $c_1,$t_2
1416 sltu $at,$c_1,$t_2
1417 $ADDU $c_2,$at
1418 mflo $t_1
1419 mfhi $t_2
1420 $ADDU $c_3,$t_1
1421 sltu $at,$c_3,$t_1
1422 $MULTU $a_4,$b_4 # mul_add_c(a[4],b[4],c3,c1,c2);
1423 $ADDU $t_2,$at
1424 $ADDU $c_1,$t_2
1425 sltu $at,$c_1,$t_2
1426 $ADDU $c_2,$at
1427 mflo $t_1
1428 mfhi $t_2
1429 $ADDU $c_3,$t_1
1430 sltu $at,$c_3,$t_1
1431 $MULTU $a_3,$b_5 # mul_add_c(a[3],b[5],c3,c1,c2);
1432 $ADDU $t_2,$at
1433 $ADDU $c_1,$t_2
1434 sltu $at,$c_1,$t_2
1435 $ADDU $c_2,$at
1436 mflo $t_1
1437 mfhi $t_2
1438 $ADDU $c_3,$t_1
1439 sltu $at,$c_3,$t_1
1440 $MULTU $a_2,$b_6 # mul_add_c(a[2],b[6],c3,c1,c2);
1441 $ADDU $t_2,$at
1442 $ADDU $c_1,$t_2
1443 sltu $at,$c_1,$t_2
1444 $ADDU $c_2,$at
1445 mflo $t_1
1446 mfhi $t_2
1447 $ADDU $c_3,$t_1
1448 sltu $at,$c_3,$t_1
1449 $MULTU $a_1,$b_7 # mul_add_c(a[1],b[7],c3,c1,c2);
1450 $ADDU $t_2,$at
1451 $ADDU $c_1,$t_2
1452 sltu $at,$c_1,$t_2
1453 $ADDU $c_2,$at
1454 mflo $t_1
1455 mfhi $t_2
1456 $ADDU $c_3,$t_1
1457 sltu $at,$c_3,$t_1
1458 $MULTU $a_2,$b_7 # mul_add_c(a[2],b[7],c1,c2,c3);
1459 $ADDU $t_2,$at
1460 $ADDU $c_1,$t_2
1461 sltu $at,$c_1,$t_2
1462 $ADDU $c_2,$at
1463 $ST $c_3,8*$BNSZ($a0) # r[8]=c3;
1465 mflo $t_1
1466 mfhi $t_2
1467 $ADDU $c_1,$t_1
1468 sltu $at,$c_1,$t_1
1469 $MULTU $a_3,$b_6 # mul_add_c(a[3],b[6],c1,c2,c3);
1470 $ADDU $t_2,$at
1471 $ADDU $c_2,$t_2
1472 sltu $c_3,$c_2,$t_2
1473 mflo $t_1
1474 mfhi $t_2
1475 $ADDU $c_1,$t_1
1476 sltu $at,$c_1,$t_1
1477 $MULTU $a_4,$b_5 # mul_add_c(a[4],b[5],c1,c2,c3);
1478 $ADDU $t_2,$at
1479 $ADDU $c_2,$t_2
1480 sltu $at,$c_2,$t_2
1481 $ADDU $c_3,$at
1482 mflo $t_1
1483 mfhi $t_2
1484 $ADDU $c_1,$t_1
1485 sltu $at,$c_1,$t_1
1486 $MULTU $a_5,$b_4 # mul_add_c(a[5],b[4],c1,c2,c3);
1487 $ADDU $t_2,$at
1488 $ADDU $c_2,$t_2
1489 sltu $at,$c_2,$t_2
1490 $ADDU $c_3,$at
1491 mflo $t_1
1492 mfhi $t_2
1493 $ADDU $c_1,$t_1
1494 sltu $at,$c_1,$t_1
1495 $MULTU $a_6,$b_3 # mul_add_c(a[6],b[3],c1,c2,c3);
1496 $ADDU $t_2,$at
1497 $ADDU $c_2,$t_2
1498 sltu $at,$c_2,$t_2
1499 $ADDU $c_3,$at
1500 mflo $t_1
1501 mfhi $t_2
1502 $ADDU $c_1,$t_1
1503 sltu $at,$c_1,$t_1
1504 $MULTU $a_7,$b_2 # mul_add_c(a[7],b[2],c1,c2,c3);
1505 $ADDU $t_2,$at
1506 $ADDU $c_2,$t_2
1507 sltu $at,$c_2,$t_2
1508 $ADDU $c_3,$at
1509 mflo $t_1
1510 mfhi $t_2
1511 $ADDU $c_1,$t_1
1512 sltu $at,$c_1,$t_1
1513 $MULTU $a_7,$b_3 # mul_add_c(a[7],b[3],c2,c3,c1);
1514 $ADDU $t_2,$at
1515 $ADDU $c_2,$t_2
1516 sltu $at,$c_2,$t_2
1517 $ADDU $c_3,$at
1518 $ST $c_1,9*$BNSZ($a0) # r[9]=c1;
1520 mflo $t_1
1521 mfhi $t_2
1522 $ADDU $c_2,$t_1
1523 sltu $at,$c_2,$t_1
1524 $MULTU $a_6,$b_4 # mul_add_c(a[6],b[4],c2,c3,c1);
1525 $ADDU $t_2,$at
1526 $ADDU $c_3,$t_2
1527 sltu $c_1,$c_3,$t_2
1528 mflo $t_1
1529 mfhi $t_2
1530 $ADDU $c_2,$t_1
1531 sltu $at,$c_2,$t_1
1532 $MULTU $a_5,$b_5 # mul_add_c(a[5],b[5],c2,c3,c1);
1533 $ADDU $t_2,$at
1534 $ADDU $c_3,$t_2
1535 sltu $at,$c_3,$t_2
1536 $ADDU $c_1,$at
1537 mflo $t_1
1538 mfhi $t_2
1539 $ADDU $c_2,$t_1
1540 sltu $at,$c_2,$t_1
1541 $MULTU $a_4,$b_6 # mul_add_c(a[4],b[6],c2,c3,c1);
1542 $ADDU $t_2,$at
1543 $ADDU $c_3,$t_2
1544 sltu $at,$c_3,$t_2
1545 $ADDU $c_1,$at
1546 mflo $t_1
1547 mfhi $t_2
1548 $ADDU $c_2,$t_1
1549 sltu $at,$c_2,$t_1
1550 $MULTU $a_3,$b_7 # mul_add_c(a[3],b[7],c2,c3,c1);
1551 $ADDU $t_2,$at
1552 $ADDU $c_3,$t_2
1553 sltu $at,$c_3,$t_2
1554 $ADDU $c_1,$at
1555 mflo $t_1
1556 mfhi $t_2
1557 $ADDU $c_2,$t_1
1558 sltu $at,$c_2,$t_1
1559 $MULTU $a_4,$b_7 # mul_add_c(a[4],b[7],c3,c1,c2);
1560 $ADDU $t_2,$at
1561 $ADDU $c_3,$t_2
1562 sltu $at,$c_3,$t_2
1563 $ADDU $c_1,$at
1564 $ST $c_2,10*$BNSZ($a0) # r[10]=c2;
1566 mflo $t_1
1567 mfhi $t_2
1568 $ADDU $c_3,$t_1
1569 sltu $at,$c_3,$t_1
1570 $MULTU $a_5,$b_6 # mul_add_c(a[5],b[6],c3,c1,c2);
1571 $ADDU $t_2,$at
1572 $ADDU $c_1,$t_2
1573 sltu $c_2,$c_1,$t_2
1574 mflo $t_1
1575 mfhi $t_2
1576 $ADDU $c_3,$t_1
1577 sltu $at,$c_3,$t_1
1578 $MULTU $a_6,$b_5 # mul_add_c(a[6],b[5],c3,c1,c2);
1579 $ADDU $t_2,$at
1580 $ADDU $c_1,$t_2
1581 sltu $at,$c_1,$t_2
1582 $ADDU $c_2,$at
1583 mflo $t_1
1584 mfhi $t_2
1585 $ADDU $c_3,$t_1
1586 sltu $at,$c_3,$t_1
1587 $MULTU $a_7,$b_4 # mul_add_c(a[7],b[4],c3,c1,c2);
1588 $ADDU $t_2,$at
1589 $ADDU $c_1,$t_2
1590 sltu $at,$c_1,$t_2
1591 $ADDU $c_2,$at
1592 mflo $t_1
1593 mfhi $t_2
1594 $ADDU $c_3,$t_1
1595 sltu $at,$c_3,$t_1
1596 $MULTU $a_7,$b_5 # mul_add_c(a[7],b[5],c1,c2,c3);
1597 $ADDU $t_2,$at
1598 $ADDU $c_1,$t_2
1599 sltu $at,$c_1,$t_2
1600 $ADDU $c_2,$at
1601 $ST $c_3,11*$BNSZ($a0) # r[11]=c3;
1603 mflo $t_1
1604 mfhi $t_2
1605 $ADDU $c_1,$t_1
1606 sltu $at,$c_1,$t_1
1607 $MULTU $a_6,$b_6 # mul_add_c(a[6],b[6],c1,c2,c3);
1608 $ADDU $t_2,$at
1609 $ADDU $c_2,$t_2
1610 sltu $c_3,$c_2,$t_2
1611 mflo $t_1
1612 mfhi $t_2
1613 $ADDU $c_1,$t_1
1614 sltu $at,$c_1,$t_1
1615 $MULTU $a_5,$b_7 # mul_add_c(a[5],b[7],c1,c2,c3);
1616 $ADDU $t_2,$at
1617 $ADDU $c_2,$t_2
1618 sltu $at,$c_2,$t_2
1619 $ADDU $c_3,$at
1620 mflo $t_1
1621 mfhi $t_2
1622 $ADDU $c_1,$t_1
1623 sltu $at,$c_1,$t_1
1624 $MULTU $a_6,$b_7 # mul_add_c(a[6],b[7],c2,c3,c1);
1625 $ADDU $t_2,$at
1626 $ADDU $c_2,$t_2
1627 sltu $at,$c_2,$t_2
1628 $ADDU $c_3,$at
1629 $ST $c_1,12*$BNSZ($a0) # r[12]=c1;
1631 mflo $t_1
1632 mfhi $t_2
1633 $ADDU $c_2,$t_1
1634 sltu $at,$c_2,$t_1
1635 $MULTU $a_7,$b_6 # mul_add_c(a[7],b[6],c2,c3,c1);
1636 $ADDU $t_2,$at
1637 $ADDU $c_3,$t_2
1638 sltu $c_1,$c_3,$t_2
1639 mflo $t_1
1640 mfhi $t_2
1641 $ADDU $c_2,$t_1
1642 sltu $at,$c_2,$t_1
1643 $MULTU $a_7,$b_7 # mul_add_c(a[7],b[7],c3,c1,c2);
1644 $ADDU $t_2,$at
1645 $ADDU $c_3,$t_2
1646 sltu $at,$c_3,$t_2
1647 $ADDU $c_1,$at
1648 $ST $c_2,13*$BNSZ($a0) # r[13]=c2;
1650 mflo $t_1
1651 mfhi $t_2
1652 $ADDU $c_3,$t_1
1653 sltu $at,$c_3,$t_1
1654 $ADDU $t_2,$at
1655 $ADDU $c_1,$t_2
1656 $ST $c_3,14*$BNSZ($a0) # r[14]=c3;
1657 $ST $c_1,15*$BNSZ($a0) # r[15]=c1;
1659 .set noreorder
1661 $code.=<<___ if ($flavour =~ /nubi/i);
1662 $REG_L $s5,10*$SZREG($sp)
1663 $REG_L $s4,9*$SZREG($sp)
1664 $REG_L $s3,8*$SZREG($sp)
1665 $REG_L $s2,7*$SZREG($sp)
1666 $REG_L $s1,6*$SZREG($sp)
1667 $REG_L $s0,5*$SZREG($sp)
1668 $REG_L $t3,4*$SZREG($sp)
1669 $REG_L $t2,3*$SZREG($sp)
1670 $REG_L $t1,2*$SZREG($sp)
1671 $REG_L $t0,1*$SZREG($sp)
1672 $REG_L $gp,0*$SZREG($sp)
1673 jr $ra
1674 $PTR_ADD $sp,12*$SZREG
1676 $code.=<<___ if ($flavour !~ /nubi/i);
1677 $REG_L $s5,5*$SZREG($sp)
1678 $REG_L $s4,4*$SZREG($sp)
1679 $REG_L $s3,3*$SZREG($sp)
1680 $REG_L $s2,2*$SZREG($sp)
1681 $REG_L $s1,1*$SZREG($sp)
1682 $REG_L $s0,0*$SZREG($sp)
1683 jr $ra
1684 $PTR_ADD $sp,6*$SZREG
# bn_mul_comba4(r,a,b): fully unrolled 4x4 Comba multiplication.  Each
# column's partial products are accumulated into the rotating carry
# triplet (c1,c2,c3) and one result word of r[] is stored per column.
1686 $code.=<<___;
1687 .end bn_mul_comba8
1689 .align 5
1690 .globl bn_mul_comba4
1691 .ent bn_mul_comba4
1692 bn_mul_comba4:
# NUBI flavour must additionally preserve $t0-$t3 and $gp, hence a frame.
1694 $code.=<<___ if ($flavour =~ /nubi/i);
1695 .frame $sp,6*$SZREG,$ra
1696 .mask 0x8000f008,-$SZREG
1697 .set noreorder
1698 $PTR_SUB $sp,6*$SZREG
1699 $REG_S $ra,5*$SZREG($sp)
1700 $REG_S $t3,4*$SZREG($sp)
1701 $REG_S $t2,3*$SZREG($sp)
1702 $REG_S $t1,2*$SZREG($sp)
1703 $REG_S $t0,1*$SZREG($sp)
1704 $REG_S $gp,0*$SZREG($sp)
1706 $code.=<<___;
1707 .set reorder
1708 $LD $a_0,0($a1)
1709 $LD $b_0,0($a2)
1710 $LD $a_1,$BNSZ($a1)
1711 $LD $a_2,2*$BNSZ($a1)
1712 $MULTU $a_0,$b_0 # mul_add_c(a[0],b[0],c1,c2,c3);
1713 $LD $a_3,3*$BNSZ($a1)
1714 $LD $b_1,$BNSZ($a2)
1715 $LD $b_2,2*$BNSZ($a2)
1716 $LD $b_3,3*$BNSZ($a2)
1717 mflo $c_1
1718 mfhi $c_2
1719 $ST $c_1,0($a0) # r[0]=c1;
1721 $MULTU $a_0,$b_1 # mul_add_c(a[0],b[1],c2,c3,c1);
1722 mflo $t_1
1723 mfhi $t_2
1724 $ADDU $c_2,$t_1
1725 sltu $at,$c_2,$t_1
1726 $MULTU $a_1,$b_0 # mul_add_c(a[1],b[0],c2,c3,c1);
1727 $ADDU $c_3,$t_2,$at
1728 mflo $t_1
1729 mfhi $t_2
1730 $ADDU $c_2,$t_1
1731 sltu $at,$c_2,$t_1
1732 $MULTU $a_2,$b_0 # mul_add_c(a[2],b[0],c3,c1,c2);
1733 $ADDU $t_2,$at
1734 $ADDU $c_3,$t_2
1735 sltu $c_1,$c_3,$t_2
1736 $ST $c_2,$BNSZ($a0) # r[1]=c2;
1738 mflo $t_1
1739 mfhi $t_2
1740 $ADDU $c_3,$t_1
1741 sltu $at,$c_3,$t_1
1742 $MULTU $a_1,$b_1 # mul_add_c(a[1],b[1],c3,c1,c2);
1743 $ADDU $t_2,$at
1744 $ADDU $c_1,$t_2
1745 mflo $t_1
1746 mfhi $t_2
1747 $ADDU $c_3,$t_1
1748 sltu $at,$c_3,$t_1
1749 $MULTU $a_0,$b_2 # mul_add_c(a[0],b[2],c3,c1,c2);
1750 $ADDU $t_2,$at
1751 $ADDU $c_1,$t_2
1752 sltu $c_2,$c_1,$t_2
1753 mflo $t_1
1754 mfhi $t_2
1755 $ADDU $c_3,$t_1
1756 sltu $at,$c_3,$t_1
1757 $MULTU $a_0,$b_3 # mul_add_c(a[0],b[3],c1,c2,c3);
1758 $ADDU $t_2,$at
1759 $ADDU $c_1,$t_2
1760 sltu $at,$c_1,$t_2
1761 $ADDU $c_2,$at
1762 $ST $c_3,2*$BNSZ($a0) # r[2]=c3;
1764 mflo $t_1
1765 mfhi $t_2
1766 $ADDU $c_1,$t_1
1767 sltu $at,$c_1,$t_1
1768 $MULTU $a_1,$b_2 # mul_add_c(a[1],b[2],c1,c2,c3);
1769 $ADDU $t_2,$at
1770 $ADDU $c_2,$t_2
1771 sltu $c_3,$c_2,$t_2
1772 mflo $t_1
1773 mfhi $t_2
1774 $ADDU $c_1,$t_1
1775 sltu $at,$c_1,$t_1
1776 $MULTU $a_2,$b_1 # mul_add_c(a[2],b[1],c1,c2,c3);
1777 $ADDU $t_2,$at
1778 $ADDU $c_2,$t_2
1779 sltu $at,$c_2,$t_2
1780 $ADDU $c_3,$at
1781 mflo $t_1
1782 mfhi $t_2
1783 $ADDU $c_1,$t_1
1784 sltu $at,$c_1,$t_1
1785 $MULTU $a_3,$b_0 # mul_add_c(a[3],b[0],c1,c2,c3);
1786 $ADDU $t_2,$at
1787 $ADDU $c_2,$t_2
1788 sltu $at,$c_2,$t_2
1789 $ADDU $c_3,$at
1790 mflo $t_1
1791 mfhi $t_2
1792 $ADDU $c_1,$t_1
1793 sltu $at,$c_1,$t_1
1794 $MULTU $a_3,$b_1 # mul_add_c(a[3],b[1],c2,c3,c1);
1795 $ADDU $t_2,$at
1796 $ADDU $c_2,$t_2
1797 sltu $at,$c_2,$t_2
1798 $ADDU $c_3,$at
1799 $ST $c_1,3*$BNSZ($a0) # r[3]=c1;
1801 mflo $t_1
1802 mfhi $t_2
1803 $ADDU $c_2,$t_1
1804 sltu $at,$c_2,$t_1
1805 $MULTU $a_2,$b_2 # mul_add_c(a[2],b[2],c2,c3,c1);
1806 $ADDU $t_2,$at
1807 $ADDU $c_3,$t_2
1808 sltu $c_1,$c_3,$t_2
1809 mflo $t_1
1810 mfhi $t_2
1811 $ADDU $c_2,$t_1
1812 sltu $at,$c_2,$t_1
1813 $MULTU $a_1,$b_3 # mul_add_c(a[1],b[3],c2,c3,c1);
1814 $ADDU $t_2,$at
1815 $ADDU $c_3,$t_2
1816 sltu $at,$c_3,$t_2
1817 $ADDU $c_1,$at
1818 mflo $t_1
1819 mfhi $t_2
1820 $ADDU $c_2,$t_1
1821 sltu $at,$c_2,$t_1
1822 $MULTU $a_2,$b_3 # mul_add_c(a[2],b[3],c3,c1,c2);
1823 $ADDU $t_2,$at
1824 $ADDU $c_3,$t_2
1825 sltu $at,$c_3,$t_2
1826 $ADDU $c_1,$at
1827 $ST $c_2,4*$BNSZ($a0) # r[4]=c2;
1829 mflo $t_1
1830 mfhi $t_2
1831 $ADDU $c_3,$t_1
1832 sltu $at,$c_3,$t_1
1833 $MULTU $a_3,$b_2 # mul_add_c(a[3],b[2],c3,c1,c2);
1834 $ADDU $t_2,$at
1835 $ADDU $c_1,$t_2
1836 sltu $c_2,$c_1,$t_2
1837 mflo $t_1
1838 mfhi $t_2
1839 $ADDU $c_3,$t_1
1840 sltu $at,$c_3,$t_1
1841 $MULTU $a_3,$b_3 # mul_add_c(a[3],b[3],c1,c2,c3);
1842 $ADDU $t_2,$at
1843 $ADDU $c_1,$t_2
1844 sltu $at,$c_1,$t_2
1845 $ADDU $c_2,$at
1846 $ST $c_3,5*$BNSZ($a0) # r[5]=c3;
1848 mflo $t_1
1849 mfhi $t_2
1850 $ADDU $c_1,$t_1
1851 sltu $at,$c_1,$t_1
1852 $ADDU $t_2,$at
1853 $ADDU $c_2,$t_2
1854 $ST $c_1,6*$BNSZ($a0) # r[6]=c1;
1855 $ST $c_2,7*$BNSZ($a0) # r[7]=c2;
1857 .set noreorder
# NUBI epilogue: restore the temporaries and $gp saved in the prologue.
1859 $code.=<<___ if ($flavour =~ /nubi/i);
1860 $REG_L $t3,4*$SZREG($sp)
1861 $REG_L $t2,3*$SZREG($sp)
1862 $REG_L $t1,2*$SZREG($sp)
1863 $REG_L $t0,1*$SZREG($sp)
1864 $REG_L $gp,0*$SZREG($sp)
1865 $PTR_ADD $sp,6*$SZREG
# All flavours return via jr $ra; the non-NUBI path saved no registers above.
1867 $code.=<<___;
1868 jr $ra
1870 .end bn_mul_comba4
# While squaring there is no b[] operand, so the registers that held
# b[0..3] during multiplication are reused for the upper half a[4..7].
1873 ($a_4,$a_5,$a_6,$a_7)=($b_0,$b_1,$b_2,$b_3);
# add_c2() emits one sqr_add_c2-style step: the product sitting in HI/LO
# is added into $c0 twice (i.e. 2*a[i]*a[j]), the resulting carries are
# folded into $c1 and $c2, and the next multiplication is issued early
# ($MULTU below) so it overlaps the carry arithmetic.
1875 sub add_c2 () {
1876 my ($hi,$lo,$c0,$c1,$c2,
1877 $warm, # !$warm denotes first call with specific sequence of
1878 # $c_[XYZ] when there is no Z-carry to accumulate yet;
1879 $an,$bn # these two are arguments for multiplication which
1880 # result is used in *next* step [which is why it's
1881 # commented as "forward multiplication" below];
1882 )=@_;
1883 $code.=<<___;
1884 mflo $lo
1885 mfhi $hi
1886 $ADDU $c0,$lo
1887 sltu $at,$c0,$lo
1888 $MULTU $an,$bn # forward multiplication
1889 $ADDU $c0,$lo
1890 $ADDU $at,$hi
1891 sltu $lo,$c0,$lo
1892 $ADDU $c1,$at
1893 $ADDU $hi,$lo
# Cold start of a column: $c2 holds no carry yet, so it is set (not
# accumulated) from the carry out of $c1.
1895 $code.=<<___ if (!$warm);
1896 sltu $c2,$c1,$at
1897 $ADDU $c1,$hi
1898 sltu $hi,$c1,$hi
1899 $ADDU $c2,$hi
# Warm steps: carries are accumulated into the already-live $c2.
1901 $code.=<<___ if ($warm);
1902 sltu $at,$c1,$at
1903 $ADDU $c1,$hi
1904 $ADDU $c2,$at
1905 sltu $hi,$c1,$hi
1906 $ADDU $c2,$hi
# bn_sqr_comba8(r,a): fully unrolled 8x8 Comba squaring.  Off-diagonal
# products a[i]*a[j] (i!=j) are doubled (inline below or via add_c2);
# diagonal products a[i]*a[i] are added once.
1910 $code.=<<___;
1912 .align 5
1913 .globl bn_sqr_comba8
1914 .ent bn_sqr_comba8
1915 bn_sqr_comba8:
1917 $code.=<<___ if ($flavour =~ /nubi/i);
1918 .frame $sp,6*$SZREG,$ra
1919 .mask 0x8000f008,-$SZREG
1920 .set noreorder
1921 $PTR_SUB $sp,6*$SZREG
1922 $REG_S $ra,5*$SZREG($sp)
1923 $REG_S $t3,4*$SZREG($sp)
1924 $REG_S $t2,3*$SZREG($sp)
1925 $REG_S $t1,2*$SZREG($sp)
1926 $REG_S $t0,1*$SZREG($sp)
1927 $REG_S $gp,0*$SZREG($sp)
1929 $code.=<<___;
1930 .set reorder
1931 $LD $a_0,0($a1)
1932 $LD $a_1,$BNSZ($a1)
1933 $LD $a_2,2*$BNSZ($a1)
1934 $LD $a_3,3*$BNSZ($a1)
1936 $MULTU $a_0,$a_0 # mul_add_c(a[0],b[0],c1,c2,c3);
1937 $LD $a_4,4*$BNSZ($a1)
1938 $LD $a_5,5*$BNSZ($a1)
1939 $LD $a_6,6*$BNSZ($a1)
1940 $LD $a_7,7*$BNSZ($a1)
1941 mflo $c_1
1942 mfhi $c_2
1943 $ST $c_1,0($a0)
1945 $MULTU $a_0,$a_1 # mul_add_c2(a[0],b[1],c2,c3,c1);
1946 mflo $t_1
1947 mfhi $t_2
# Double HI:LO by shifting each half left one bit; slt against $zero
# captures the bit shifted out ($a2 serves as scratch here -- squaring
# takes no b argument, and $a2 is never loaded from in this routine).
1948 slt $c_1,$t_2,$zero
1949 $SLL $t_2,1
1950 $MULTU $a_2,$a_0 # mul_add_c2(a[2],b[0],c3,c1,c2);
1951 slt $a2,$t_1,$zero
1952 $ADDU $t_2,$a2
1953 $SLL $t_1,1
1954 $ADDU $c_2,$t_1
1955 sltu $at,$c_2,$t_1
1956 $ADDU $c_3,$t_2,$at
1957 $ST $c_2,$BNSZ($a0)
1959 &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
1960 $a_1,$a_1); # mul_add_c(a[1],b[1],c3,c1,c2);
1961 $code.=<<___;
1962 mflo $t_1
1963 mfhi $t_2
1964 $ADDU $c_3,$t_1
1965 sltu $at,$c_3,$t_1
1966 $MULTU $a_0,$a_3 # mul_add_c2(a[0],b[3],c1,c2,c3);
1967 $ADDU $t_2,$at
1968 $ADDU $c_1,$t_2
1969 sltu $at,$c_1,$t_2
1970 $ADDU $c_2,$at
1971 $ST $c_3,2*$BNSZ($a0)
1973 &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
1974 $a_1,$a_2); # mul_add_c2(a[1],b[2],c1,c2,c3);
1975 &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
1976 $a_4,$a_0); # mul_add_c2(a[4],b[0],c2,c3,c1);
1977 $code.=<<___;
1978 $ST $c_1,3*$BNSZ($a0)
1980 &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
1981 $a_3,$a_1); # mul_add_c2(a[3],b[1],c2,c3,c1);
1982 &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
1983 $a_2,$a_2); # mul_add_c(a[2],b[2],c2,c3,c1);
1984 $code.=<<___;
1985 mflo $t_1
1986 mfhi $t_2
1987 $ADDU $c_2,$t_1
1988 sltu $at,$c_2,$t_1
1989 $MULTU $a_0,$a_5 # mul_add_c2(a[0],b[5],c3,c1,c2);
1990 $ADDU $t_2,$at
1991 $ADDU $c_3,$t_2
1992 sltu $at,$c_3,$t_2
1993 $ADDU $c_1,$at
1994 $ST $c_2,4*$BNSZ($a0)
1996 &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
1997 $a_1,$a_4); # mul_add_c2(a[1],b[4],c3,c1,c2);
1998 &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
1999 $a_2,$a_3); # mul_add_c2(a[2],b[3],c3,c1,c2);
2000 &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
2001 $a_6,$a_0); # mul_add_c2(a[6],b[0],c1,c2,c3);
2002 $code.=<<___;
2003 $ST $c_3,5*$BNSZ($a0)
2005 &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
2006 $a_5,$a_1); # mul_add_c2(a[5],b[1],c1,c2,c3);
2007 &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
2008 $a_4,$a_2); # mul_add_c2(a[4],b[2],c1,c2,c3);
2009 &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
2010 $a_3,$a_3); # mul_add_c(a[3],b[3],c1,c2,c3);
2011 $code.=<<___;
2012 mflo $t_1
2013 mfhi $t_2
2014 $ADDU $c_1,$t_1
2015 sltu $at,$c_1,$t_1
2016 $MULTU $a_0,$a_7 # mul_add_c2(a[0],b[7],c2,c3,c1);
2017 $ADDU $t_2,$at
2018 $ADDU $c_2,$t_2
2019 sltu $at,$c_2,$t_2
2020 $ADDU $c_3,$at
2021 $ST $c_1,6*$BNSZ($a0)
2023 &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
2024 $a_1,$a_6); # mul_add_c2(a[1],b[6],c2,c3,c1);
2025 &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
2026 $a_2,$a_5); # mul_add_c2(a[2],b[5],c2,c3,c1);
2027 &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
2028 $a_3,$a_4); # mul_add_c2(a[3],b[4],c2,c3,c1);
2029 &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
2030 $a_7,$a_1); # mul_add_c2(a[7],b[1],c3,c1,c2);
2031 $code.=<<___;
2032 $ST $c_2,7*$BNSZ($a0)
2034 &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
2035 $a_6,$a_2); # mul_add_c2(a[6],b[2],c3,c1,c2);
2036 &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
2037 $a_5,$a_3); # mul_add_c2(a[5],b[3],c3,c1,c2);
2038 &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
2039 $a_4,$a_4); # mul_add_c(a[4],b[4],c3,c1,c2);
2040 $code.=<<___;
2041 mflo $t_1
2042 mfhi $t_2
2043 $ADDU $c_3,$t_1
2044 sltu $at,$c_3,$t_1
2045 $MULTU $a_2,$a_7 # mul_add_c2(a[2],b[7],c1,c2,c3);
2046 $ADDU $t_2,$at
2047 $ADDU $c_1,$t_2
2048 sltu $at,$c_1,$t_2
2049 $ADDU $c_2,$at
2050 $ST $c_3,8*$BNSZ($a0)
2052 &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
2053 $a_3,$a_6); # mul_add_c2(a[3],b[6],c1,c2,c3);
2054 &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
2055 $a_4,$a_5); # mul_add_c2(a[4],b[5],c1,c2,c3);
2056 &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
2057 $a_7,$a_3); # mul_add_c2(a[7],b[3],c2,c3,c1);
2058 $code.=<<___;
2059 $ST $c_1,9*$BNSZ($a0)
2061 &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
2062 $a_6,$a_4); # mul_add_c2(a[6],b[4],c2,c3,c1);
2063 &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
2064 $a_5,$a_5); # mul_add_c(a[5],b[5],c2,c3,c1);
2065 $code.=<<___;
2066 mflo $t_1
2067 mfhi $t_2
2068 $ADDU $c_2,$t_1
2069 sltu $at,$c_2,$t_1
2070 $MULTU $a_4,$a_7 # mul_add_c2(a[4],b[7],c3,c1,c2);
2071 $ADDU $t_2,$at
2072 $ADDU $c_3,$t_2
2073 sltu $at,$c_3,$t_2
2074 $ADDU $c_1,$at
2075 $ST $c_2,10*$BNSZ($a0)
2077 &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
2078 $a_5,$a_6); # mul_add_c2(a[5],b[6],c3,c1,c2);
2079 &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
2080 $a_7,$a_5); # mul_add_c2(a[7],b[5],c1,c2,c3);
2081 $code.=<<___;
2082 $ST $c_3,11*$BNSZ($a0)
2084 &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
2085 $a_6,$a_6); # mul_add_c(a[6],b[6],c1,c2,c3);
2086 $code.=<<___;
2087 mflo $t_1
2088 mfhi $t_2
2089 $ADDU $c_1,$t_1
2090 sltu $at,$c_1,$t_1
2091 $MULTU $a_6,$a_7 # mul_add_c2(a[6],b[7],c2,c3,c1);
2092 $ADDU $t_2,$at
2093 $ADDU $c_2,$t_2
2094 sltu $at,$c_2,$t_2
2095 $ADDU $c_3,$at
2096 $ST $c_1,12*$BNSZ($a0)
2098 &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
2099 $a_7,$a_7); # mul_add_c(a[7],b[7],c3,c1,c2);
2100 $code.=<<___;
2101 $ST $c_2,13*$BNSZ($a0)
2103 mflo $t_1
2104 mfhi $t_2
2105 $ADDU $c_3,$t_1
2106 sltu $at,$c_3,$t_1
2107 $ADDU $t_2,$at
2108 $ADDU $c_1,$t_2
2109 $ST $c_3,14*$BNSZ($a0)
2110 $ST $c_1,15*$BNSZ($a0)
2112 .set noreorder
# NUBI epilogue: restore the temporaries and $gp saved in the prologue.
2114 $code.=<<___ if ($flavour =~ /nubi/i);
2115 $REG_L $t3,4*$SZREG($sp)
2116 $REG_L $t2,3*$SZREG($sp)
2117 $REG_L $t1,2*$SZREG($sp)
2118 $REG_L $t0,1*$SZREG($sp)
2119 $REG_L $gp,0*$SZREG($sp)
2120 $PTR_ADD $sp,6*$SZREG
2122 $code.=<<___;
2123 jr $ra
2125 .end bn_sqr_comba8
# bn_sqr_comba4(r,a): 4x4 Comba squaring, same column scheme as
# bn_sqr_comba8 above, unrolled for 4 words.
2127 .align 5
2128 .globl bn_sqr_comba4
2129 .ent bn_sqr_comba4
2130 bn_sqr_comba4:
2132 $code.=<<___ if ($flavour =~ /nubi/i);
2133 .frame $sp,6*$SZREG,$ra
2134 .mask 0x8000f008,-$SZREG
2135 .set noreorder
2136 $PTR_SUB $sp,6*$SZREG
2137 $REG_S $ra,5*$SZREG($sp)
2138 $REG_S $t3,4*$SZREG($sp)
2139 $REG_S $t2,3*$SZREG($sp)
2140 $REG_S $t1,2*$SZREG($sp)
2141 $REG_S $t0,1*$SZREG($sp)
2142 $REG_S $gp,0*$SZREG($sp)
2144 $code.=<<___;
2145 .set reorder
2146 $LD $a_0,0($a1)
2147 $LD $a_1,$BNSZ($a1)
2148 $MULTU $a_0,$a_0 # mul_add_c(a[0],b[0],c1,c2,c3);
2149 $LD $a_2,2*$BNSZ($a1)
2150 $LD $a_3,3*$BNSZ($a1)
2151 mflo $c_1
2152 mfhi $c_2
2153 $ST $c_1,0($a0)
2155 $MULTU $a_0,$a_1 # mul_add_c2(a[0],b[1],c2,c3,c1);
2156 mflo $t_1
2157 mfhi $t_2
# Double HI:LO by shifting left one bit, capturing the shifted-out bits
# with slt ($a2, not otherwise used when squaring, is scratch).
2158 slt $c_1,$t_2,$zero
2159 $SLL $t_2,1
2160 $MULTU $a_2,$a_0 # mul_add_c2(a[2],b[0],c3,c1,c2);
2161 slt $a2,$t_1,$zero
2162 $ADDU $t_2,$a2
2163 $SLL $t_1,1
2164 $ADDU $c_2,$t_1
2165 sltu $at,$c_2,$t_1
2166 $ADDU $c_3,$t_2,$at
2167 $ST $c_2,$BNSZ($a0)
2169 &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
2170 $a_1,$a_1); # mul_add_c(a[1],b[1],c3,c1,c2);
2171 $code.=<<___;
2172 mflo $t_1
2173 mfhi $t_2
2174 $ADDU $c_3,$t_1
2175 sltu $at,$c_3,$t_1
2176 $MULTU $a_0,$a_3 # mul_add_c2(a[0],b[3],c1,c2,c3);
2177 $ADDU $t_2,$at
2178 $ADDU $c_1,$t_2
2179 sltu $at,$c_1,$t_2
2180 $ADDU $c_2,$at
2181 $ST $c_3,2*$BNSZ($a0)
2183 &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
2184 $a_1,$a_2); # mul_add_c2(a[1],b[2],c1,c2,c3);
2185 &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
2186 $a_3,$a_1); # mul_add_c2(a[3],b[1],c2,c3,c1);
2187 $code.=<<___;
2188 $ST $c_1,3*$BNSZ($a0)
2190 &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
2191 $a_2,$a_2); # mul_add_c(a[2],b[2],c2,c3,c1);
2192 $code.=<<___;
2193 mflo $t_1
2194 mfhi $t_2
2195 $ADDU $c_2,$t_1
2196 sltu $at,$c_2,$t_1
2197 $MULTU $a_2,$a_3 # mul_add_c2(a[2],b[3],c3,c1,c2);
2198 $ADDU $t_2,$at
2199 $ADDU $c_3,$t_2
2200 sltu $at,$c_3,$t_2
2201 $ADDU $c_1,$at
2202 $ST $c_2,4*$BNSZ($a0)
2204 &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
2205 $a_3,$a_3); # mul_add_c(a[3],b[3],c1,c2,c3);
2206 $code.=<<___;
2207 $ST $c_3,5*$BNSZ($a0)
2209 mflo $t_1
2210 mfhi $t_2
2211 $ADDU $c_1,$t_1
2212 sltu $at,$c_1,$t_1
2213 $ADDU $t_2,$at
2214 $ADDU $c_2,$t_2
2215 $ST $c_1,6*$BNSZ($a0)
2216 $ST $c_2,7*$BNSZ($a0)
2218 .set noreorder
# NUBI epilogue: restore the temporaries and $gp saved in the prologue.
2220 $code.=<<___ if ($flavour =~ /nubi/i);
2221 $REG_L $t3,4*$SZREG($sp)
2222 $REG_L $t2,3*$SZREG($sp)
2223 $REG_L $t1,2*$SZREG($sp)
2224 $REG_L $t0,1*$SZREG($sp)
2225 $REG_L $gp,0*$SZREG($sp)
2226 $PTR_ADD $sp,6*$SZREG
2228 $code.=<<___;
2229 jr $ra
2231 .end bn_sqr_comba4
# Emit the accumulated assembly on STDOUT.  close() is checked
# explicitly: with buffered output a write error (e.g. ENOSPC or a
# broken pipe) may only surface at close time, and a silently truncated
# .s file must fail the build rather than be assembled.
print $code;
close STDOUT or die "error closing STDOUT: $!";