beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / powerpc64 / mode64 / mul_1.asm
blob27a8f8fb4d1524c62f8ac8cfeb155e69da74bb68
1 dnl PowerPC-64 mpn_mul_1 -- Multiply a limb vector with a limb and store
2 dnl the result in a second limb vector.
4 dnl Copyright 1999-2001, 2003-2006, 2010 Free Software Foundation, Inc.
6 dnl This file is part of the GNU MP Library.
7 dnl
8 dnl The GNU MP Library is free software; you can redistribute it and/or modify
9 dnl it under the terms of either:
10 dnl
11 dnl * the GNU Lesser General Public License as published by the Free
12 dnl Software Foundation; either version 3 of the License, or (at your
13 dnl option) any later version.
14 dnl
15 dnl or
16 dnl
17 dnl * the GNU General Public License as published by the Free Software
18 dnl Foundation; either version 2 of the License, or (at your option) any
19 dnl later version.
20 dnl
21 dnl or both in parallel, as here.
22 dnl
23 dnl The GNU MP Library is distributed in the hope that it will be useful, but
24 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
25 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
26 dnl for more details.
27 dnl
28 dnl You should have received copies of the GNU General Public License and the
29 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
30 dnl see https://www.gnu.org/licenses/.
32 include(`../config.m4')
34 C cycles/limb
35 C POWER3/PPC630 6-18
36 C POWER4/PPC970 7.25? not updated for last file revision
37 C POWER5 7.25
38 C POWER6 14
39 C POWER7 2.9
41 C TODO
42 C * Try to reduce the number of needed live registers (at least r5 and r10
43 C could be combined)
44 C * Optimize feed-in code, for speed and size.
45 C * Clean up r12/r7 usage in feed-in code.
47 C INPUT PARAMETERS
48 define(`rp', `r3') C destination pointer: product limbs are stored through it
49 define(`up', `r4') C source pointer: multiplicand limbs are loaded through it
50 define(`n', `r5') C limb count; only used to derive the dispatch case and ctr
51 define(`vl', `r6') C multiplier limb; the code below names r6 directly
53 ASM_START()
C mpn_mul_1c -- entry point taking an initial carry limb.
C
C Same register inputs as mpn_mul_1 (rp, up, n, r6) plus a carry-in limb in
C r7.  It saves r27 and r26 in the same slots mpn_mul_1 uses, copies the
C carry-in to r12, and branches to L(ent) inside mpn_mul_1, so the rest of
C that routine (including the reload of r27/r26 at L(ret)) is shared.
54 PROLOGUE(mpn_mul_1c)
55 std r27, -40(r1) C save r27 below r1; r1 itself is not adjusted
56 std r26, -48(r1) C save r26 in the adjacent slot
57 mr r12, r7 C r12 = carry-in limb, where L(ent) expects it
58 b L(ent) C continue in the shared body of mpn_mul_1
59 EPILOGUE()
C mpn_mul_1 -- multiply the n-limb vector at up by the limb in r6, store the
C n low product limbs at rp, and return the final carry-out limb in r3.
C
C mpn_mul_1c joins at L(ent) with a caller-supplied carry-in limb in r12;
C this entry point starts with r12 = 0.
C
C Register use in the body:
C   r26, r27   current pair of source limbs (saved on entry, reloaded at L(ret))
C   r12        high limb still to be added into the next stored limb
C   CA         carry bit chained through addc/adde/addze between limbs
C   ctr        initialised to (n + 3) >> 2
C   r5         reused as a scratch high limb once n has been copied to ctr
C
C Dispatch on n & 3 first handles 2, 3, 0 or 1 limbs (cases b00, b01 with
C n > 1, b10, b11 respectively) so that the remainder fits the 4-limb loop
C at L(top) followed by the 2-limb tail at L(end).
C
C NOTE(review): n is presumably required to be at least 1, since the first
C limb is loaded before n is examined -- confirm against the mpn interface.
60 PROLOGUE(mpn_mul_1)
61 std r27, -40(r1) C save r27 below r1; r1 itself is not adjusted
62 std r26, -48(r1) C save r26 in the adjacent slot
63 li r12, 0 C cy_limb = 0
64 L(ent): ld r26, 0(up) C first source limb, loaded before the dispatch
66 rldicl. r0, n, 0,62 C r0 = n & 3, set cr0
67 cmpdi cr6, r0, 2 C cr6 = (n & 3) compared with 2
C addic also writes CA.  The L(b10) path reaches its first adde with no addc
C in between, so it depends on CA being clear here (true unless n + 3 wraps).
68 addic n, n, 3 C compute count...
69 srdi n, n, 2 C ...for ctr
70 mtctr n C copy count into ctr
71 beq cr0, L(b00) C n & 3 == 0
72 blt cr6, L(b01) C n & 3 == 1
73 beq cr6, L(b10) C n & 3 == 2; otherwise fall through with n & 3 == 3
C n & 3 == 3: handle one limb here, leaving a multiple of 4 plus the 2-limb tail.
75 L(b11): mr r7, r12 C keep carry-in; r12 is overwritten by mulhdu below
76 mulld r0, r26, r6 C low half of up[0] * r6
77 mulhdu r12, r26, r6 C high half, pending for the next limb
78 addi up, up, 8
79 addc r0, r0, r7 C add carry-in; carry-out stays in CA for the next adde
80 std r0, 0(rp)
81 addi rp, rp, 8
82 b L(fic)
C n & 3 == 0: handle two limbs here.
84 L(b00): ld r27, 8(up)
85 addi up, up, 16
86 mulld r0, r26, r6
87 mulhdu r5, r26, r6
88 mulld r7, r27, r6
89 mulhdu r8, r27, r6
90 addc r0, r0, r12 C add carry-in limb
91 adde r7, r7, r5
92 addze r12, r8 C r12 = high limb + CA, pending for the next limb
93 std r0, 0(rp)
94 std r7, 8(rp)
95 addi rp, rp, 16
96 b L(fic)
98 nop C alignment
C n & 3 == 1: bdnz decrements ctr; it falls through only when n == 1.
99 L(b01): bdnz L(gt1)
100 mulld r0, r26, r6
101 mulhdu r8, r26, r6 C high limb goes to r8, which L(ret) folds into r3
102 addc r0, r0, r12
103 std r0, 0(rp)
104 b L(ret)
C n & 3 == 1 with n > 1: handle three limbs here.
105 L(gt1): ld r27, 8(up)
107 mulld r0, r26, r6
108 mulhdu r5, r26, r6
109 ld r26, 16(up)
110 mulld r7, r27, r6
111 mulhdu r8, r27, r6
112 mulld r9, r26, r6
113 mulhdu r10, r26, r6
114 addc r0, r0, r12 C add carry-in limb
115 adde r7, r7, r5
116 adde r9, r9, r8
117 addze r12, r10 C r12 = high limb + CA, pending for the next limb
118 std r0, 0(rp)
119 std r7, 8(rp)
120 std r9, 16(rp)
121 addi up, up, 24
122 addi rp, rp, 24
123 b L(fic)
C Load the next limb pair.  L(b10) enters one instruction later because r26
C was already loaded at L(ent).
126 L(fic): ld r26, 0(up)
127 L(b10): ld r27, 8(up)
128 addi up, up, 16
129 bdz L(end) C no full 4-limb group left: go straight to the tail
C Main loop: four limbs per pass.  The pair in r26/r27 is multiplied, then the
C registers are reloaded twice: once for the second half of this pass and once
C for the following pass (or for L(end)).
131 L(top): mulld r0, r26, r6
132 mulhdu r5, r26, r6
133 mulld r7, r27, r6
134 mulhdu r8, r27, r6
135 ld r26, 0(up)
136 ld r27, 8(up)
137 adde r0, r0, r12 C add pending high limb and CA from the previous group
138 adde r7, r7, r5
139 mulld r9, r26, r6
140 mulhdu r10, r26, r6
141 mulld r11, r27, r6
142 mulhdu r12, r27, r6 C new pending high limb for the next group
143 ld r26, 16(up)
144 ld r27, 24(up)
145 std r0, 0(rp)
146 adde r9, r9, r8
147 std r7, 8(rp)
148 adde r11, r11, r10
149 std r9, 16(rp)
150 addi up, up, 32
151 std r11, 24(rp)
153 addi rp, rp, 32
154 bdnz L(top)
C Tail: the last two limbs, already in r26/r27.
156 L(end): mulld r0, r26, r6
157 mulhdu r5, r26, r6
158 mulld r7, r27, r6
159 mulhdu r8, r27, r6
160 adde r0, r0, r12
161 adde r7, r7, r5
162 std r0, 0(rp)
163 std r7, 8(rp)
164 L(ret): addze r3, r8 C return value = last high limb + CA
165 ld r27, -40(r1) C reload the registers saved on entry
166 ld r26, -48(r1)
167 blr C return to caller; without it execution runs past EPILOGUE()
168 EPILOGUE()