source/libs/gmp/gmp-src/mpn/powerpc64/mode64/mul_basecase.asm
dnl PowerPC-64 mpn_mul_basecase.

dnl Copyright 1999-2001, 2003-2006, 2008 Free Software Foundation, Inc.

dnl This file is part of the GNU MP Library.
dnl
dnl The GNU MP Library is free software; you can redistribute it and/or modify
dnl it under the terms of either:
dnl
dnl   * the GNU Lesser General Public License as published by the Free
dnl     Software Foundation; either version 3 of the License, or (at your
dnl     option) any later version.
dnl
dnl or
dnl
dnl   * the GNU General Public License as published by the Free Software
dnl     Foundation; either version 2 of the License, or (at your option) any
dnl     later version.
dnl
dnl or both in parallel, as here.
dnl
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl for more details.
dnl
dnl You should have received copies of the GNU General Public License and the
dnl GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl see https://www.gnu.org/licenses/.

include(`../config.m4')

C               cycles/limb
C POWER3/PPC630     6-18
C POWER4/PPC970        8
C POWER5               8
C POWER6              24

C INPUT PARAMETERS
define(`rp', `r3')
define(`up', `r4')
define(`un', `r5')
define(`vp', `r6')
define(`vn', `r7')

define(`v0', `r25')
define(`outer_rp', `r22')
define(`outer_up', `r23')
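
C  mpn_mul_basecase writes the full un+vn limb product of {up,un} and
C  {vp,vn} to {rp,un+vn}; un >= vn >= 1 is required and rp must not
C  overlap the operands.  As a rough illustration only (not part of this
C  file), the same schoolbook structure in C, assuming 64-bit limbs and
C  the public mpn_mul_1/mpn_addmul_1 entry points from gmp.h:
C
C      #include <gmp.h>
C      static void
C      ref_mul_basecase (mp_limb_t *rp, const mp_limb_t *up, mp_size_t un,
C                        const mp_limb_t *vp, mp_size_t vn)
C      {
C        /* first pass writes the product of {up,un} by v[0] */
C        rp[un] = mpn_mul_1 (rp, up, un, vp[0]);
C        /* each further pass accumulates one limb-shifted row */
C        for (mp_size_t i = 1; i < vn; i++)
C          rp[un + i] = mpn_addmul_1 (rp + i, up, un, vp[i]);
C      }
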
ASM_START()
PROLOGUE(mpn_mul_basecase)

C Special code for un <= 2, for efficiency of these important cases,
C and since it simplifies the default code.
	cmpdi	cr0, un, 2
	bgt	cr0, L(un_gt2)
	cmpdi	cr6, vn, 1
	ld	r7, 0(vp)
	ld	r5, 0(up)
	mulld	r8, r5, r7	C weight 0
	mulhdu	r9, r5, r7	C weight 1
	std	r8, 0(rp)
	beq	cr0, L(2x)
	std	r9, 8(rp)
	blr
	ALIGN(16)
L(2x):	ld	r0, 8(up)
	mulld	r8, r0, r7	C weight 1
	mulhdu	r10, r0, r7	C weight 2
	addc	r9, r9, r8
	addze	r10, r10
	bne	cr6, L(2x2)
	std	r9, 8(rp)
	std	r10, 16(rp)
	blr
	ALIGN(16)
L(2x2):	ld	r6, 8(vp)
	nop
	mulld	r8, r5, r6	C weight 1
	mulhdu	r11, r5, r6	C weight 2
	addc	r9, r9, r8
	std	r9, 8(rp)
	adde	r11, r11, r10
	mulld	r12, r0, r6	C weight 2
	mulhdu	r0, r0, r6	C weight 3
	addze	r0, r0
	addc	r11, r11, r12
	addze	r0, r0
	std	r11, 16(rp)
	std	r0, 24(rp)
	blr
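
C  Throughout, each mulld/mulhdu pair forms the low and high 64 bits of a
C  full 128-bit product.  In C terms (illustration only, assuming a compiler
C  with unsigned __int128 and types from <stdint.h>):
C
C      unsigned __int128 p = (unsigned __int128) u * v;
C      uint64_t lo = (uint64_t) p;          /* mulld  */
C      uint64_t hi = (uint64_t) (p >> 64);  /* mulhdu */
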
L(un_gt2):
	std	r31, -8(r1)
	std	r30, -16(r1)
	std	r29, -24(r1)
	std	r28, -32(r1)
	std	r27, -40(r1)
	std	r26, -48(r1)
	std	r25, -56(r1)
	std	r24, -64(r1)
	std	r23, -72(r1)
	std	r22, -80(r1)

	mr	outer_rp, rp
	mr	outer_up, up

	ld	v0, 0(vp)	C new v limb
	addi	vp, vp, 8
	ld	r26, 0(up)

	rldicl.	r0, un, 0,62	C r0 = n & 3, set cr0
	cmpdi	cr6, r0, 2
	addi	un, un, 1	C compute count...
	srdi	un, un, 2	C ...for ctr
	mtctr	un	C copy inner loop count into ctr
	beq	cr0, L(b0)
	blt	cr6, L(b1)
	beq	cr6, L(b2)
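
C  Dispatch on un mod 4 (r0): fall through to L(b3) for un = 3 (mod 4),
C  otherwise branch to L(b0), L(b1) or L(b2).  The un register now holds
C  (un+1)/4, the inner-loop trip count kept in ctr; each inner loop below
C  handles four limbs of up per iteration.
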
	ALIGN(16)
L(b3):	mulld	r0, r26, v0
	mulhdu	r12, r26, v0
	addic	r0, r0, 0
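C  (the addic above adds 0, only clearing the carry bit so that the adde
C  chain in L(lo_m_3) starts with no incoming carry)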
	std	r0, 0(rp)
	ld	r26, 8(up)
	ld	r27, 16(up)
	bdz	L(end_m_3)

	ALIGN(16)
L(lo_m_3):
	mulld	r0, r26, v0
	mulhdu	r31, r26, v0
	ld	r26, 24(up)

	mulld	r24, r27, v0
	mulhdu	r8, r27, v0
	ld	r27, 32(up)

	adde	r0, r0, r12
	adde	r24, r24, r31
	mulld	r9, r26, v0
	mulhdu	r10, r26, v0
	ld	r26, 40(up)

	mulld	r11, r27, v0
	mulhdu	r12, r27, v0
	ld	r27, 48(up)
	std	r0, 8(rp)
	adde	r9, r9, r8
	std	r24, 16(rp)
	adde	r11, r11, r10
	std	r9, 24(rp)
	addi	up, up, 32
	std	r11, 32(rp)
	addi	rp, rp, 32
	bdnz	L(lo_m_3)

	ALIGN(16)
L(end_m_3):
	mulld	r0, r26, v0
	mulhdu	r31, r26, v0

	mulld	r24, r27, v0
	mulhdu	r8, r27, v0

	adde	r0, r0, r12
	adde	r24, r24, r31

	std	r0, 8(rp)
	std	r24, 16(rp)
	addze	r8, r8
	std	r8, 24(rp)
	addic.	vn, vn, -1
	beq	L(ret)
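
C  Each L(outer_lo_N) block below performs one pass per remaining v limb:
C  it multiplies {up,un} by the next v limb and adds the result into the
C  partial product already at rp, which advances by one limb per pass.
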
	ALIGN(16)
L(outer_lo_3):
	mtctr	un	C copy inner loop count into ctr
	addi	rp, outer_rp, 8
	mr	up, outer_up
	addi	outer_rp, outer_rp, 8
	ld	v0, 0(vp)	C new v limb
	addi	vp, vp, 8
	ld	r26, 0(up)
	ld	r28, 0(rp)
	mulld	r0, r26, v0
	mulhdu	r12, r26, v0
	addc	r0, r0, r28
	std	r0, 0(rp)
	ld	r26, 8(up)
	ld	r27, 16(up)
	bdz	L(end_3)
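
C  In the annotated loops below, the numeric right-hand comments appear to
C  record which register values die (are last used) at each instruction.
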
	ALIGN(16)	C registers dying
L(lo_3):
	mulld	r0, r26, v0	C
	mulhdu	r10, r26, v0	C 26
	ld	r26, 24(up)	C
	ld	r28, 8(rp)	C
	mulld	r24, r27, v0	C
	mulhdu	r8, r27, v0	C 27
	ld	r27, 32(up)	C
	ld	r29, 16(rp)	C
	adde	r0, r0, r12	C 0 12
	adde	r24, r24, r10	C 24 10
	mulld	r9, r26, v0	C
	mulhdu	r10, r26, v0	C 26
	ld	r26, 40(up)	C
	ld	r30, 24(rp)	C
	mulld	r11, r27, v0	C
	mulhdu	r12, r27, v0	C 27
	ld	r27, 48(up)	C
	ld	r31, 32(rp)	C
	adde	r9, r9, r8	C 8 9
	adde	r11, r11, r10	C 10 11
	addze	r12, r12	C 12
	addc	r0, r0, r28	C 0 28
	std	r0, 8(rp)	C 0
	adde	r24, r24, r29	C 7 29
	std	r24, 16(rp)	C 7
	adde	r9, r9, r30	C 9 30
	std	r9, 24(rp)	C 9
	adde	r11, r11, r31	C 11 31
	std	r11, 32(rp)	C 11
	addi	up, up, 32	C
	addi	rp, rp, 32	C
	bdnz	L(lo_3)	C

	ALIGN(16)
L(end_3):
	mulld	r0, r26, v0
	mulhdu	r10, r26, v0
	ld	r28, 8(rp)

	mulld	r24, r27, v0
	mulhdu	r8, r27, v0
	ld	r29, 16(rp)

	adde	r0, r0, r12
	adde	r24, r24, r10
	addze	r8, r8
	addc	r0, r0, r28
	std	r0, 8(rp)
	adde	r24, r24, r29
	std	r24, 16(rp)
	addze	r8, r8
	std	r8, 24(rp)

	addic.	vn, vn, -1
	bne	L(outer_lo_3)
	b	L(ret)

	ALIGN(16)
L(b0):	ld	r27, 8(up)
	addi	up, up, 8
	mulld	r0, r26, v0
	mulhdu	r10, r26, v0
	mulld	r24, r27, v0
	mulhdu	r8, r27, v0
	addc	r24, r24, r10
	addze	r12, r8
	std	r0, 0(rp)
	std	r24, 8(rp)
	addi	rp, rp, 8
	ld	r26, 8(up)
	ld	r27, 16(up)
	bdz	L(end_m_0)

	ALIGN(16)
L(lo_m_0):
	mulld	r0, r26, v0
	mulhdu	r31, r26, v0
	ld	r26, 24(up)

	mulld	r24, r27, v0
	mulhdu	r8, r27, v0
	ld	r27, 32(up)

	adde	r0, r0, r12
	adde	r24, r24, r31
	mulld	r9, r26, v0
	mulhdu	r10, r26, v0
	ld	r26, 40(up)

	mulld	r11, r27, v0
	mulhdu	r12, r27, v0
	ld	r27, 48(up)
	std	r0, 8(rp)
	adde	r9, r9, r8
	std	r24, 16(rp)
	adde	r11, r11, r10
	std	r9, 24(rp)
	addi	up, up, 32
	std	r11, 32(rp)
	addi	rp, rp, 32
	bdnz	L(lo_m_0)

	ALIGN(16)
L(end_m_0):
	mulld	r0, r26, v0
	mulhdu	r31, r26, v0

	mulld	r24, r27, v0
	mulhdu	r8, r27, v0

	adde	r0, r0, r12
	adde	r24, r24, r31

	std	r0, 8(rp)
	addze	r8, r8
	std	r24, 16(rp)
	addic.	vn, vn, -1
	std	r8, 24(rp)

	beq	L(ret)

	ALIGN(16)
L(outer_lo_0):
	mtctr	un	C copy inner loop count into ctr
	addi	rp, outer_rp, 16
	addi	up, outer_up, 8
	addi	outer_rp, outer_rp, 8
	ld	v0, 0(vp)	C new v limb
	addi	vp, vp, 8
	ld	r26, -8(up)
	ld	r27, 0(up)
	ld	r28, -8(rp)
	ld	r29, 0(rp)

	mulld	r0, r26, v0
	mulhdu	r10, r26, v0
	mulld	r24, r27, v0
	mulhdu	r8, r27, v0
	addc	r24, r24, r10
	addze	r12, r8
	addc	r0, r0, r28
	std	r0, -8(rp)
	adde	r24, r24, r29
	std	r24, 0(rp)
	ld	r26, 8(up)
	ld	r27, 16(up)
	bdz	L(end_0)

	ALIGN(16)	C registers dying
L(lo_0):
	mulld	r0, r26, v0	C
	mulhdu	r10, r26, v0	C 26
	ld	r26, 24(up)	C
	ld	r28, 8(rp)	C
	mulld	r24, r27, v0	C
	mulhdu	r8, r27, v0	C 27
	ld	r27, 32(up)	C
	ld	r29, 16(rp)	C
	adde	r0, r0, r12	C 0 12
	adde	r24, r24, r10	C 24 10
	mulld	r9, r26, v0	C
	mulhdu	r10, r26, v0	C 26
	ld	r26, 40(up)	C
	ld	r30, 24(rp)	C
	mulld	r11, r27, v0	C
	mulhdu	r12, r27, v0	C 27
	ld	r27, 48(up)	C
	ld	r31, 32(rp)	C
	adde	r9, r9, r8	C 8 9
	adde	r11, r11, r10	C 10 11
	addze	r12, r12	C 12
	addc	r0, r0, r28	C 0 28
	std	r0, 8(rp)	C 0
	adde	r24, r24, r29	C 7 29
	std	r24, 16(rp)	C 7
	adde	r9, r9, r30	C 9 30
	std	r9, 24(rp)	C 9
	adde	r11, r11, r31	C 11 31
	std	r11, 32(rp)	C 11
	addi	up, up, 32	C
	addi	rp, rp, 32	C
	bdnz	L(lo_0)	C

	ALIGN(16)
L(end_0):
	mulld	r0, r26, v0
	mulhdu	r10, r26, v0
	ld	r28, 8(rp)

	mulld	r24, r27, v0
	mulhdu	r8, r27, v0
	ld	r29, 16(rp)

	adde	r0, r0, r12
	adde	r24, r24, r10
	addze	r8, r8
	addic.	vn, vn, -1
	addc	r0, r0, r28
	std	r0, 8(rp)
	adde	r24, r24, r29
	std	r24, 16(rp)
	addze	r8, r8
	std	r8, 24(rp)
	bne	L(outer_lo_0)
	b	L(ret)

	ALIGN(16)
L(b1):	ld	r27, 8(up)

	mulld	r0, r26, v0
	mulhdu	r31, r26, v0
	ld	r26, 16(up)
	mulld	r24, r27, v0
	mulhdu	r8, r27, v0
	mulld	r9, r26, v0
	mulhdu	r10, r26, v0
	addc	r24, r24, r31
	adde	r9, r9, r8
	addze	r12, r10
	std	r0, 0(rp)
	std	r24, 8(rp)
	std	r9, 16(rp)
	addi	up, up, 16
	addi	rp, rp, 16
	ld	r26, 8(up)
	ld	r27, 16(up)
	bdz	L(end_m_1)

	ALIGN(16)
L(lo_m_1):
	mulld	r0, r26, v0
	mulhdu	r31, r26, v0
	ld	r26, 24(up)

	mulld	r24, r27, v0
	mulhdu	r8, r27, v0
	ld	r27, 32(up)

	adde	r0, r0, r12
	adde	r24, r24, r31
	mulld	r9, r26, v0
	mulhdu	r10, r26, v0
	ld	r26, 40(up)

	mulld	r11, r27, v0
	mulhdu	r12, r27, v0
	ld	r27, 48(up)
	std	r0, 8(rp)
	adde	r9, r9, r8
	std	r24, 16(rp)
	adde	r11, r11, r10
	std	r9, 24(rp)
	addi	up, up, 32
	std	r11, 32(rp)
	addi	rp, rp, 32
	bdnz	L(lo_m_1)

	ALIGN(16)
L(end_m_1):
	mulld	r0, r26, v0
	mulhdu	r31, r26, v0

	mulld	r24, r27, v0
	mulhdu	r8, r27, v0

	adde	r0, r0, r12
	adde	r24, r24, r31

	std	r0, 8(rp)
	addze	r8, r8
	std	r24, 16(rp)
	addic.	vn, vn, -1
	std	r8, 24(rp)

	beq	L(ret)

	ALIGN(16)
L(outer_lo_1):
	mtctr	un	C copy inner loop count into ctr
	addi	rp, outer_rp, 24
	addi	up, outer_up, 16
	addi	outer_rp, outer_rp, 8
	ld	v0, 0(vp)	C new v limb
	addi	vp, vp, 8
	ld	r26, -16(up)
	ld	r27, -8(up)
	mulld	r0, r26, v0
	mulhdu	r31, r26, v0
	ld	r26, 0(up)
	ld	r28, -16(rp)
	mulld	r24, r27, v0
	mulhdu	r8, r27, v0
	ld	r29, -8(rp)
	ld	r30, 0(rp)
	mulld	r9, r26, v0
	mulhdu	r10, r26, v0
	addc	r24, r24, r31
	adde	r9, r9, r8
	addze	r12, r10
	addc	r0, r0, r28
	std	r0, -16(rp)
	adde	r24, r24, r29
	std	r24, -8(rp)
	adde	r9, r9, r30
	std	r9, 0(rp)
	ld	r26, 8(up)
	ld	r27, 16(up)
	bdz	L(end_1)

	ALIGN(16)	C registers dying
L(lo_1):
	mulld	r0, r26, v0	C
	mulhdu	r10, r26, v0	C 26
	ld	r26, 24(up)	C
	ld	r28, 8(rp)	C
	mulld	r24, r27, v0	C
	mulhdu	r8, r27, v0	C 27
	ld	r27, 32(up)	C
	ld	r29, 16(rp)	C
	adde	r0, r0, r12	C 0 12
	adde	r24, r24, r10	C 24 10
	mulld	r9, r26, v0	C
	mulhdu	r10, r26, v0	C 26
	ld	r26, 40(up)	C
	ld	r30, 24(rp)	C
	mulld	r11, r27, v0	C
	mulhdu	r12, r27, v0	C 27
	ld	r27, 48(up)	C
	ld	r31, 32(rp)	C
	adde	r9, r9, r8	C 8 9
	adde	r11, r11, r10	C 10 11
	addze	r12, r12	C 12
	addc	r0, r0, r28	C 0 28
	std	r0, 8(rp)	C 0
	adde	r24, r24, r29	C 7 29
	std	r24, 16(rp)	C 7
	adde	r9, r9, r30	C 9 30
	std	r9, 24(rp)	C 9
	adde	r11, r11, r31	C 11 31
	std	r11, 32(rp)	C 11
	addi	up, up, 32	C
	addi	rp, rp, 32	C
	bdnz	L(lo_1)	C

	ALIGN(16)
L(end_1):
	mulld	r0, r26, v0
	mulhdu	r10, r26, v0
	ld	r28, 8(rp)

	mulld	r24, r27, v0
	mulhdu	r8, r27, v0
	ld	r29, 16(rp)

	adde	r0, r0, r12
	adde	r24, r24, r10
	addze	r8, r8
	addic.	vn, vn, -1
	addc	r0, r0, r28
	std	r0, 8(rp)
	adde	r24, r24, r29
	std	r24, 16(rp)
	addze	r8, r8
	std	r8, 24(rp)
	bne	L(outer_lo_1)
	b	L(ret)

	ALIGN(16)
L(b2):	ld	r27, 8(up)
	addi	up, up, -8
	addi	rp, rp, -8
	li	r12, 0
	addic	r12, r12, 0
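C  (li+addic above clear r12 and the carry bit, so the first adde in
C  L(lo_m_2) sees a zero addend and no incoming carry)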

	ALIGN(16)
L(lo_m_2):
	mulld	r0, r26, v0
	mulhdu	r31, r26, v0
	ld	r26, 24(up)

	mulld	r24, r27, v0
	mulhdu	r8, r27, v0
	ld	r27, 32(up)

	adde	r0, r0, r12
	adde	r24, r24, r31
	mulld	r9, r26, v0
	mulhdu	r10, r26, v0
	ld	r26, 40(up)

	mulld	r11, r27, v0
	mulhdu	r12, r27, v0
	ld	r27, 48(up)
	std	r0, 8(rp)
	adde	r9, r9, r8
	std	r24, 16(rp)
	adde	r11, r11, r10
	std	r9, 24(rp)
	addi	up, up, 32
	std	r11, 32(rp)
	addi	rp, rp, 32
	bdnz	L(lo_m_2)

	ALIGN(16)
L(end_m_2):
	mulld	r0, r26, v0
	mulhdu	r31, r26, v0

	mulld	r24, r27, v0
	mulhdu	r8, r27, v0

	adde	r0, r0, r12
	adde	r24, r24, r31

	std	r0, 8(rp)
	addze	r8, r8
	std	r24, 16(rp)
	addic.	vn, vn, -1
	std	r8, 24(rp)

	beq	L(ret)

	ALIGN(16)
L(outer_lo_2):
	mtctr	un	C copy inner loop count into ctr
	addi	rp, outer_rp, 0
	addi	up, outer_up, -8
	addi	outer_rp, outer_rp, 8
	ld	v0, 0(vp)	C new v limb
	addi	vp, vp, 8
	ld	r26, 8(up)
	ld	r27, 16(up)
	li	r12, 0
	addic	r12, r12, 0

	ALIGN(16)	C registers dying
L(lo_2):
	mulld	r0, r26, v0	C
	mulhdu	r10, r26, v0	C 26
	ld	r26, 24(up)	C
	ld	r28, 8(rp)	C
	mulld	r24, r27, v0	C
	mulhdu	r8, r27, v0	C 27
	ld	r27, 32(up)	C
	ld	r29, 16(rp)	C
	adde	r0, r0, r12	C 0 12
	adde	r24, r24, r10	C 24 10
	mulld	r9, r26, v0	C
	mulhdu	r10, r26, v0	C 26
	ld	r26, 40(up)	C
	ld	r30, 24(rp)	C
	mulld	r11, r27, v0	C
	mulhdu	r12, r27, v0	C 27
	ld	r27, 48(up)	C
	ld	r31, 32(rp)	C
	adde	r9, r9, r8	C 8 9
	adde	r11, r11, r10	C 10 11
	addze	r12, r12	C 12
	addc	r0, r0, r28	C 0 28
	std	r0, 8(rp)	C 0
	adde	r24, r24, r29	C 7 29
	std	r24, 16(rp)	C 7
	adde	r9, r9, r30	C 9 30
	std	r9, 24(rp)	C 9
	adde	r11, r11, r31	C 11 31
	std	r11, 32(rp)	C 11
	addi	up, up, 32	C
	addi	rp, rp, 32	C
	bdnz	L(lo_2)	C

	ALIGN(16)
L(end_2):
	mulld	r0, r26, v0
	mulhdu	r10, r26, v0
	ld	r28, 8(rp)

	mulld	r24, r27, v0
	mulhdu	r8, r27, v0
	ld	r29, 16(rp)

	adde	r0, r0, r12
	adde	r24, r24, r10
	addze	r8, r8
	addic.	vn, vn, -1
	addc	r0, r0, r28
	std	r0, 8(rp)
	adde	r24, r24, r29
	std	r24, 16(rp)
	addze	r8, r8
	std	r8, 24(rp)
	bne	L(outer_lo_2)
	b	L(ret)

L(ret):	ld	r31, -8(r1)
	ld	r30, -16(r1)
	ld	r29, -24(r1)
	ld	r28, -32(r1)
	ld	r27, -40(r1)
	ld	r26, -48(r1)
	ld	r25, -56(r1)
	ld	r24, -64(r1)
	ld	r23, -72(r1)
	ld	r22, -80(r1)
	blr
EPILOGUE()