1 dnl PowerPC-64 mpn_sqr_basecase.
3 dnl Contributed to the GNU project by Torbjorn Granlund.
5 dnl Copyright 1999-2001, 2003-2006, 2008, 2010, 2011 Free Software Foundation,
8 dnl This file is part of the GNU MP Library.
10 dnl The GNU MP Library is free software; you can redistribute it and/or modify
11 dnl it under the terms of either:
13 dnl * the GNU Lesser General Public License as published by the Free
14 dnl Software Foundation; either version 3 of the License, or (at your
15 dnl option) any later version.
19 dnl * the GNU General Public License as published by the Free Software
20 dnl Foundation; either version 2 of the License, or (at your option) any
23 dnl or both in parallel, as here.
25 dnl The GNU MP Library is distributed in the hope that it will be useful, but
26 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
27 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
30 dnl You should have received copies of the GNU General Public License and the
31 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
32 dnl see https://www.gnu.org/licenses/.
34 include(`..
/config.m4
')
44 C * This is very crude, cleanup!
45 C * Try to reduce the number of needed live registers.
46 C * Rewrite for POWER6 to use 8 consecutive muls, not 2 groups of 4. The
47 C cost will be more live registers.
48 C * Rewrite for POWER7 to use addmul_2 building blocks; this will reduce code
49 C size a lot and speed things up perhaps 25%.
50 C * Use computed goto in order to compress the code.
51 C * Implement a larger final corner.
52 C * Schedule callee-saves register saves into other insns. This could save
53 C about 5 cycles/call. (We cannot analogously optimise the restores, since
54 C the sqr_diag_addlsh1 loop has no wind-down code as currently written.)
55 C * Should the alternating std/adde sequences be split? Some pipelines handle
56 C adde poorly, and might sequentialise all these instructions.
57 C * The sqr_diag_addlsh1 loop was written for POWER6 and its preferences for
58 C adjacent integer multiply insns. Except for the multiply insns, the code
59 C was not carefully optimised for POWER6 or any other CPU.
60 C * Perform cross-jumping in sqr_diag_addlsh1's feed-in code, into the loop.
67 define
(`rp_outer
', `r25')
68 define
(`up_outer
', `r21')
69 define
(`rp_saved
', `r22')
70 define
(`up_saved
', `r23')
71 define
(`n_saved
', `r24')
74 PROLOGUE
(mpn_sqr_basecase
)
79 mulld r8
, r5
, r5 C weight
0
80 mulhdu r9
, r5
, r5 C weight
1
85 L
(ge2
): bgt cr0
, L
(gt2
)
88 mulld r8
, r0
, r0 C u0
* u0
89 mulhdu r9
, r0
, r0 C u0
* u0
91 mulld r10
, r6
, r6 C u1
* u1
92 mulhdu r11
, r6
, r6 C u1
* u1
93 mulld r4
, r6
, r0 C u1
* u0
94 mulhdu r5
, r6
, r0 C u1
* u0
108 L
(gt2
): std r31
, -8(r1
)
126 rldicl. r0
, n
, 0,62 C r0
= n
& 3, set cr0
128 addic r7
, n
, 2 C compute count...
129 srdi r7
, r7
, 2 C ...for ctr
130 mtctr r7 C copy count
into ctr
139 li r12
, 0 C carry limb
143 L
(tm3
): mulld r0
, r9
, r6
167 L
(em3
): mulld r0
, r9
, r6
192 L
(tm0
): mulld r0
, r9
, r6
216 L
(em0
): mulld r0
, r9
, r6
227 b L
(outer_loop_ent_2
)
246 L
(tm1
): mulld r0
, r9
, r6
270 L
(em1
): mulld r0
, r9
, r6
281 b L
(outer_loop_ent_3
)
283 L
(b2
): addi r7
, r7
, -1 C FIXME
308 L
(tm2
): mulld r0
, r9
, r6
332 L
(em2
): mulld r0
, r9
, r6
343 b L
(outer_loop_ent_0
)
348 addi up_outer
, up_outer
, 8
349 addi rp_outer
, rp_outer
, 16
388 L
(ta1
): mulld r0
, r9
, r6
389 mulhdu r26
, r9
, r6 C
9
391 mulhdu r8
, r27
, r6 C
27
396 adde r0
, r0
, r12 C
0 12
397 adde r7
, r7
, r26 C
5 7
399 mulhdu r10
, r9
, r6 C
9
401 mulhdu r12
, r27
, r6 C
27
406 adde r26
, r26
, r8 C
8 5
407 adde r11
, r11
, r10 C
10 11
409 addc r0
, r0
, r28 C
0 28
411 adde r7
, r7
, r29 C
7 29
413 adde r26
, r26
, r30 C
5 30
415 adde r11
, r11
, r31 C
11 31
421 L
(ea1
): mulld r0
, r9
, r6
439 addi up_outer
, up_outer
, 8
440 addi rp_outer
, rp_outer
, 16
470 L
(ta0
): mulld r0
, r9
, r6
471 mulhdu r26
, r9
, r6 C
9
473 mulhdu r8
, r27
, r6 C
27
478 adde r0
, r0
, r12 C
0 12
479 adde r7
, r7
, r26 C
5 7
481 mulhdu r10
, r9
, r6 C
9
483 mulhdu r12
, r27
, r6 C
27
488 adde r26
, r26
, r8 C
8 5
489 adde r11
, r11
, r10 C
10 11
491 addc r0
, r0
, r28 C
0 28
493 adde r7
, r7
, r29 C
7 29
495 adde r26
, r26
, r30 C
5 30
497 adde r11
, r11
, r31 C
11 31
503 L
(ea0
): mulld r0
, r9
, r6
521 addi up_outer
, up_outer
, 8
522 addi rp_outer
, rp_outer
, 16
544 L
(ta3
): mulld r0
, r9
, r6
545 mulhdu r26
, r9
, r6 C
9
547 mulhdu r8
, r27
, r6 C
27
552 adde r0
, r0
, r12 C
0 12
553 adde r7
, r7
, r26 C
5 7
555 mulhdu r10
, r9
, r6 C
9
557 mulhdu r12
, r27
, r6 C
27
562 adde r26
, r26
, r8 C
8 5
563 adde r11
, r11
, r10 C
10 11
565 addc r0
, r0
, r28 C
0 28
567 adde r7
, r7
, r29 C
7 29
569 adde r26
, r26
, r30 C
5 30
571 adde r11
, r11
, r31 C
11 31
577 L
(ea3
): mulld r0
, r9
, r6
596 addi up_outer
, up_outer
, 8
597 addi rp_outer
, rp_outer
, 16
606 li r12
, 0 C cy_limb
= 0
614 L
(ta2
): mulld r0
, r9
, r6
615 mulhdu r26
, r9
, r6 C
9
617 mulhdu r8
, r27
, r6 C
27
622 adde r0
, r0
, r12 C
0 12
623 adde r7
, r7
, r26 C
5 7
625 mulhdu r10
, r9
, r6 C
9
627 mulhdu r12
, r27
, r6 C
27
632 adde r26
, r26
, r8 C
8 5
633 adde r11
, r11
, r10 C
10 11
635 addc r0
, r0
, r28 C
0 28
637 adde r7
, r7
, r29 C
7 29
639 adde r26
, r26
, r30 C
5 30
641 adde r11
, r11
, r31 C
11 31
647 L
(ea2
): mulld r0
, r9
, r6
676 define
(`rp
', `rp_saved')
679 define
(`climb
', `r0')
685 rldicl. r0
, n
, 0,62 C r0
= n
& 3, set cr0
687 addi n
, n
, 2 C compute count...
688 srdi n
, n
, 2 C ...for ctr
689 mtctr n C put
loop count
into ctr
795 addic rp
, rp
, 8 C clear carry as side
-effect
848 L
(end): addze climb
, climb