beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / powerpc64 / mode64 / p6 / aorsmul_1.asm
blobc572b917e225c23c7e5eabc50b3ddd0c8bb94250
1 dnl PowerPC-64 mpn_addmul_1 and mpn_submul_1 optimised for power6.
3 dnl Copyright 1999-2001, 2003-2006, 2008, 2010, 2011 Free Software Foundation,
4 dnl Inc.
6 dnl This file is part of the GNU MP Library.
7 dnl
8 dnl The GNU MP Library is free software; you can redistribute it and/or modify
9 dnl it under the terms of either:
10 dnl
11 dnl * the GNU Lesser General Public License as published by the Free
12 dnl Software Foundation; either version 3 of the License, or (at your
13 dnl option) any later version.
14 dnl
15 dnl or
16 dnl
17 dnl * the GNU General Public License as published by the Free Software
18 dnl Foundation; either version 2 of the License, or (at your option) any
19 dnl later version.
20 dnl
21 dnl or both in parallel, as here.
22 dnl
23 dnl The GNU MP Library is distributed in the hope that it will be useful, but
24 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
25 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
26 dnl for more details.
27 dnl
28 dnl You should have received copies of the GNU General Public License and the
29 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
30 dnl see https://www.gnu.org/licenses/.
32 include(`../config.m4')
34 C mpn_addmul_1 mpn_submul_1
35 C cycles/limb cycles/limb
36 C POWER3/PPC630 ? ?
37 C POWER4/PPC970 ? ?
38 C POWER5 ? ?
39 C POWER6 12.25 12.8
40 C POWER7 ? ?
42 C TODO
43 C * Reduce register usage.
44 C * Schedule function entry code.
45 C * Unroll more. 8-way unrolling would bring us to 10 c/l, 16-way unrolling
46 C would bring us to 9 c/l.
47 C * Handle n = 1 and perhaps n = 2 separately, without saving any registers.
C INPUT PARAMETERS
C
C   rp  (r3)  limb vector that is read, combined with the product and
C             written back in place; r3 also carries the return value
C   up  (r4)  limb vector that is multiplied by v0; only read
C   n   (r5)  limb count; consumed at entry to compute the loop count, after
C             which r5 is reused as a scratch product register.  n = 0 is
C             not handled: the loop body always executes before ctr is tested
C   v0  (r6)  the single multiplier limb
define(`rp', `r3')
define(`up', `r4')
define(`n',  `r5')
define(`v0', `r6')
C Macro set used when this file is built as mpn_addmul_1.
C
C   ADDSUB    carry-producing add that starts a chain on the rp limbs
C   ADDSUBC   carry-consuming add that continues that chain
C   func      name of the generated entry point
C   func_nc   name of a carry-in variant; defined but no code is emitted
C             for it in this file
C   AM / SM   AM emits its argument, SM emits nothing, so add-only and
C             subtract-only instructions can share one code body
C   CLRRSC    adds 0 to r0 into the given register.  It is only invoked
C             on the path where r0 = n mod 4 = 0, so the register becomes 0
C             and the carry bit is cleared
ifdef(`OPERATION_addmul_1',`
  define(ADDSUBC,	adde)
  define(ADDSUB,	addc)
  define(func,		mpn_addmul_1)
  define(func_nc,	mpn_addmul_1c)	C FIXME: not really supported
  define(AM,		`$1')
  define(SM,		`')
  define(CLRRSC,	`addic	$1, r0, 0')
')
C Macro set used when this file is built as mpn_submul_1.
C
C   ADDSUB    carry-producing subtract-from that starts a chain on the rp
C             limbs (result = rp limb minus product limb)
C   ADDSUBC   carry-consuming subtract-from that continues that chain
C   func      name of the generated entry point
C   func_nc   name of a carry-in variant; defined but no code is emitted
C             for it in this file
C   AM / SM   SM emits its argument, AM emits nothing, so add-only and
C             subtract-only instructions can share one code body
C   CLRRSC    subtracts r0 from itself into the given register, giving 0
C             with the carry bit set, which for subtract means no borrow
ifdef(`OPERATION_submul_1',`
  define(ADDSUBC,	subfe)
  define(ADDSUB,	subfc)
  define(func,		mpn_submul_1)
  define(func_nc,	mpn_submul_1c)	C FIXME: not really supported
  define(AM,		`')
  define(SM,		`$1')
  define(CLRRSC,	`subfc	$1, r0, r0')
')
MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)

C func (rp, up, n, v0)      func is mpn_addmul_1 or mpn_submul_1
C
C Multiplies the n limbs at up by the limb v0 and adds the product to
C (addmul) or subtracts it from (submul) the n limbs at rp, storing the
C result back at rp.  The limb that is carried or borrowed out of the top
C is returned in r3.
C
C Structure
C   * The main loop is unrolled 4 ways.  ctr = (n + 3) / 4.
C   * n mod 4 selects the entry: b0 falls into the loop top, while b1, b2
C     and b3 process 1, 2 or 3 limbs and then jump to l1, l2 or l3 inside
C     the loop tail, so the first pass through the loop is a partial one.
C   * up and rp are biased at each entry so that the fixed displacements
C     used inside the loop line up.
C
C Register use
C   r0            n mod 4 at entry, then a low product limb in the loop
C   r5 r9 r11     low product limbs
C   r10 r8 r7 r27 source limbs, overwritten by their high product halves
C   r28-r31       limbs loaded from rp
C   r12           high limb carried from one group of limbs into the next
C   r27-r31       saved on entry and restored on exit
C
C The five saved registers are stored below r1 without moving r1;
C presumably this relies on the ABI protected area below the stack
C pointer for leaf functions -- confirm against the target ABI.

ASM_START()
PROLOGUE(func)
	std	r31, -8(r1)
	std	r30, -16(r1)
	std	r29, -24(r1)
	std	r28, -32(r1)
	std	r27, -40(r1)

	rldicl.	r0, n, 0,62	C r0 = n & 3, set cr0
	cmpdi	cr6, r0, 2
	addi	n, n, 3		C compute count...
	srdi	n, n, 2		C ...for ctr
	mtctr	n		C copy loop count into ctr
	beq	cr0, L(b0)	C n mod 4 = 0
	blt	cr6, L(b1)	C n mod 4 = 1
	beq	cr6, L(b2)	C n mod 4 = 2

C n mod 4 = 3: handle three limbs, then join the loop tail at l3.
L(b3):	ld	r8, 0(up)
	ld	r7, 8(up)
	ld	r27, 16(up)
	addi	up, up, 16
	addi	rp, rp, 16
	mulld	r5, r8, v0
	mulhdu	r8, r8, v0
	mulld	r9, r7, v0
	mulhdu	r7, r7, v0
	mulld	r11, r27, v0
	mulhdu	r27, r27, v0
	ld	r29, -16(rp)
	ld	r30, -8(rp)
	ld	r31, 0(rp)
	addc	r9, r9, r8	C fold each high half into the next low half
	adde	r11, r11, r7
	addze	r12, r27	C r12 = top high half plus carry
	ADDSUB	r5, r5, r29	C start the add/subtract chain on rp limbs
	b	L(l3)

C n mod 4 = 2: handle two limbs, then join the loop tail at l2.
L(b2):	ld	r7, 0(up)
	ld	r27, 8(up)
	addi	up, up, 8
	addi	rp, rp, 8
	mulld	r9, r7, v0
	mulhdu	r7, r7, v0
	mulld	r11, r27, v0
	mulhdu	r27, r27, v0
	ld	r30, -8(rp)
	ld	r31, 0(rp)
	addc	r11, r11, r7
	addze	r12, r27
	ADDSUB	r9, r9, r30
	b	L(l2)

C n mod 4 = 1: handle one limb, then join the loop tail at l1.
L(b1):	ld	r27, 0(up)
	ld	r31, 0(rp)
	mulld	r11, r27, v0
	mulhdu	r12, r27, v0
	ADDSUB	r11, r11, r31
	b	L(l1)

C n mod 4 = 0: bias the pointers and enter the loop at the top with no
C pending carry limb.  r0 is 0 here, which CLRRSC depends on.
L(b0):	addi	up, up, -8
	addi	rp, rp, -8
	CLRRSC(	r12)			C clear r12 and clr/set cy

	ALIGN(32)
L(top):
C For submul the carry bit left by the subtract chain is inverted here
C before it is fed to the adde chain below.
SM(`	subfe	r11, r0, r0')		C complement...
SM(`	addic	r11, r11, 1')		C ...carry flag
	ld	r10, 8(up)
	ld	r8, 16(up)
	ld	r7, 24(up)
	ld	r27, 32(up)
	addi	up, up, 32
	addi	rp, rp, 32
	mulld	r0, r10, v0
	mulhdu	r10, r10, v0
	mulld	r5, r8, v0
	mulhdu	r8, r8, v0
	mulld	r9, r7, v0
	mulhdu	r7, r7, v0
	mulld	r11, r27, v0
	mulhdu	r27, r27, v0
	ld	r28, -24(rp)
	adde	r0, r0, r12	C bring in the limb carried from the last group
	ld	r29, -16(rp)
	adde	r5, r5, r10
	ld	r30, -8(rp)
	ld	r31, 0(rp)
	adde	r9, r9, r8
	adde	r11, r11, r7
	addze	r12, r27	C r12 = limb carried into the next group
	ADDSUB	r0, r0, r28
	std	r0, -24(rp)
	ADDSUBC	r5, r5, r29
L(l3):	std	r5, -16(rp)
	ADDSUBC	r9, r9, r30
L(l2):	std	r9, -8(rp)
	ADDSUBC	r11, r11, r31
L(l1):	std	r11, 0(rp)
	bdnz	L(top)

C Return value.  addmul: r3 = r12 plus the pending carry.  submul: subfe
C leaves r11 = 0 or -1 depending on the carry bit, and r3 = r12 - r11.
AM(`	addze	r3, r12')
SM(`	subfe	r11, r0, r0')		C complement...
	ld	r31, -8(r1)
SM(`	subf	r3, r11, r12')
	ld	r30, -16(r1)
	ld	r29, -24(r1)
	ld	r28, -32(r1)
	ld	r27, -40(r1)
	blr				C return to caller
EPILOGUE()