beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / sparc64 / ultrasparct3 / aormul_4.asm
blob845f6d6d69210994223b93669976e6ba91a2949e
1 dnl SPARC v9 mpn_mul_4 and mpn_addmul_4 for T3/T4/T5.
3 dnl Contributed to the GNU project by Torbjörn Granlund.
5 dnl Copyright 2013 Free Software Foundation, Inc.
7 dnl This file is part of the GNU MP Library.
8 dnl
9 dnl The GNU MP Library is free software; you can redistribute it and/or modify
10 dnl it under the terms of either:
11 dnl
12 dnl * the GNU Lesser General Public License as published by the Free
13 dnl Software Foundation; either version 3 of the License, or (at your
14 dnl option) any later version.
15 dnl
16 dnl or
17 dnl
18 dnl * the GNU General Public License as published by the Free Software
19 dnl Foundation; either version 2 of the License, or (at your option) any
20 dnl later version.
21 dnl
22 dnl or both in parallel, as here.
23 dnl
24 dnl The GNU MP Library is distributed in the hope that it will be useful, but
25 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
26 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
27 dnl for more details.
28 dnl
29 dnl You should have received copies of the GNU General Public License and the
30 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
31 dnl see https://www.gnu.org/licenses/.
33 include(`../config.m4')
36 C                  cycles/limb    cycles/limb
37 C                     mul_4        addmul_4
38 C UltraSPARC T3:      21.5          22.0
39 C UltraSPARC T4:       2.625         2.75
42 C The code is well scheduled and relies very little on out-of-order (OoO)
43 C execution. There is hope that it will run at around 2.5 c/l (mul_4) and 2.75 c/l (addmul_4) on T4.
45 define(`rp', `%i0') C incoming argument: result limbs are stored through this pointer
46 define(`up', `%i1') C incoming argument: source limbs are loaded through this pointer
47 define(`n', `%i2') C incoming argument: limb count, later scaled to a byte index
48 define(`vp', `%i3') C incoming argument: pointer to the four multiplier limbs
50 define(`v0', `%g1') C multiplier limb loaded from vp + 0
51 define(`v1', `%o7') C multiplier limb loaded from vp + 8
52 define(`v2', `%g2') C multiplier limb loaded from vp + 16
53 define(`v3', `%i3') C multiplier limb loaded from vp + 24; same register as vp
55 define(`w0', `%o0') C accumulator word 0, the word that gets stored
56 define(`w1', `%o1') C accumulator word 1
57 define(`w2', `%o2') C accumulator word 2
58 define(`w3', `%o3') C accumulator word 3
59 define(`w4', `%o4') C captured carry out of the low-half additions
61 define(`r0', `%o5') C limb read back from rp; only used on the AM4 lines
63 define(`u0', `%i4') C source limb consumed at L(top) and L(end)
64 define(`u1', `%i5') C source limb consumed at L(mid)
66 define(`rp0', `rp') C alias of rp, used as the base for stores
67 define(`rp1', `%g3') C set to rp0 + 8 in the function setup
68 define(`rp2', `%g4') C set to rp0 + 16 in the function setup
69 define(`up0', `up') C alias of up, used as the base for loads
70 define(`up1', `%g5') C set to up0 + 8 in the function setup
C Variant selection.  Exactly one of OPERATION_mul_4 and OPERATION_addmul_4 is
C expected to be defined when this file is preprocessed.
C
C   AM4(x)   expands to x in the addmul_4 build and to nothing in the mul_4
C            build; it wraps the instructions that read and add the old rp limb.
C   ADDX(x)  adds the first high-half product.  The mul_4 build uses addcc and
C            so starts a fresh carry chain; the addmul_4 build uses addxccc so
C            that the carry from the preceding AM4 addcc is included.
C   func     is the name given to PROLOGUE.
C
C Each ifdef body is a quoted argument and must be closed by its own
C terminator line, otherwise m4 absorbs the rest of the file into it.
72 ifdef(`OPERATION_mul_4',`
73 define(`AM4', `')
74 define(`ADDX', `addcc`'$1')
75 define(`func', `mpn_mul_4')
76 ')
77 ifdef(`OPERATION_addmul_4',`
78 define(`AM4', `$1')
79 define(`ADDX', `addxccc($1,$2,$3)')
80 define(`func', `mpn_addmul_4')
81 ')
C Names of the two functions this one source file can be built into.
84 MULFUNC_PROLOGUE(mpn_mul_4 mpn_addmul_4)
86 ASM_START()
C %g2 is used as v2 and %g3 as rp1 in the code below.  REGISTER is defined
C outside this file; presumably it emits the assembler declaration that marks
C these global registers as scratch -- TODO confirm in the included m4 files.
87 REGISTER(%g2,#scratch)
88 REGISTER(%g3,#scratch)
C func(rp, up, n, vp) -- built as mpn_mul_4 or mpn_addmul_4.
C
C Multiplies the n limbs at up by the four limbs at vp.  In the addmul_4 build
C the n limbs already at rp are added in as well (the AM4 lines).  Limbs
C rp[0] .. rp[n+2] are stored, and the most significant limb of the result is
C returned: it is written to %i0, which is the caller's %o0 after restore.
C
C The loop is unrolled twice.  L(top) works on the limb in u0 and L(mid) on the
C limb in u1, and each half stores one result limb.  The products are
C pipelined: every half adds the low halves (%l0-%l3) and high halves
C (%l4-%l7) that were issued in the previous half while issuing the
C multiplications for its own limb.
C
C NOTE(review): tracing the two entry paths, the shortest odd case consumes
C three limbs of up and the shortest even case four, so n >= 3 appears to be
C assumed -- confirm against the callers.
89 PROLOGUE(func)
90 save %sp, -176, %sp C new register window and a 176-byte stack frame
92 ldx [up + 0], u1 C load up[0] early
93 andcc n, 1, %g0 C is n odd?  Nothing below alters the cc before the be
94 ldx [vp + 0], v0
95 sllx n, 3, n C limb count to byte count, 8 bytes per limb
96 ldx [vp + 8], v1
97 add n, -28, n C bias; it cancels in every pointer + n sum once n is negated
98 ldx [vp + 16], v2
99 add rp, -16, rp
100 ldx [vp + 24], v3 C v3 shares %i3 with vp, so this must be the last use of vp
101 add up, n, up0 C up0 is up itself
102 add rp, n, rp0 C rp0 is rp itself
103 add up0, 8, up1
104 add rp0, 8, rp1
105 add rp0, 16, rp2
106 mulx u1, v0, %l0 C low halves of up[0] times v0..v3
107 mov 0, w0 C clear the accumulator words
108 mulx u1, v1, %l1
109 mov 0, w1
110 mulx u1, v2, %l2
111 mov 0, w2
112 mulx u1, v3, %l3
113 mov 0, w3
115 be L(evn) C taken when n is even
116 neg n, n C delay slot, executed on both paths
118 L(odd): mov u1, u0 C up[0] is handled as the u0 limb
119 ldx [up1 + n], u1 C u1 = up[1]
120 AM4(` ldx [rp2 + n], r0')
121 umulxhi(u0, v0, %l4) C high halves of up[0] times v0..v3
122 umulxhi(u0, v1, %l5)
123 umulxhi(u0, v2, %l6)
124 umulxhi(u0, v3, %l7)
125 b L(mid) C enter the loop at its second half
126 add n, 8, n C delay slot
128 L(evn): ldx [up1 + n], u0 C u0 = up[1]
129 AM4(` ldx [rp2 + n], r0')
130 umulxhi(u1, v0, %l4) C high halves of up[0] times v0..v3
131 umulxhi(u1, v1, %l5)
132 umulxhi(u1, v2, %l6)
133 umulxhi(u1, v3, %l7)
134 add n, 16, n
136 ALIGN(16)
137 L(top): addcc %l0, w0, w0 C add the pending low halves
138 mulx u0, v0, %l0 C w 0
139 addxccc(%l1, w1, w1)
140 mulx u0, v1, %l1 C w 1
141 addxccc(%l2, w2, w2)
142 mulx u0, v2, %l2 C w 2
143 addxccc(%l3, w3, w3)
144 mulx u0, v3, %l3 C w 3
145 ldx [up0 + n], u1 C limb for L(mid)
146 addxc( %g0, %g0, w4) C w4 = carry out of the low-half additions
147 AM4(` addcc r0, w0, w0')
148 stx w0, [rp0 + n] C w0 is now a finished result limb
149 ADDX(` %l4, w1, w0')
150 umulxhi(u0, v0, %l4) C w 1
151 AM4(` ldx [rp1 + n], r0')
152 addxccc(%l5, w2, w1) C pending high halves go in one word lower:
153 umulxhi(u0, v1, %l5) C w 2
154 addxccc(%l6, w3, w2) C the accumulator shifts down past the stored limb
155 umulxhi(u0, v2, %l6) C w 3
156 addxc( %l7, w4, w3)
157 umulxhi(u0, v3, %l7) C w 4
158 L(mid): addcc %l0, w0, w0 C same steps as L(top) with u0 and u1 swapped
159 mulx u1, v0, %l0 C w 1
160 addxccc(%l1, w1, w1)
161 mulx u1, v1, %l1 C w 2
162 addxccc(%l2, w2, w2)
163 mulx u1, v2, %l2 C w 3
164 addxccc(%l3, w3, w3)
165 mulx u1, v3, %l3 C w 4
166 ldx [up1 + n], u0 C limb for L(top) or L(end)
167 addxc( %g0, %g0, w4)
168 AM4(` addcc r0, w0, w0')
169 stx w0, [rp1 + n]
170 ADDX(` %l4, w1, w0')
171 umulxhi(u1, v0, %l4) C w 2
172 AM4(` ldx [rp2 + n], r0')
173 addxccc(%l5, w2, w1)
174 umulxhi(u1, v1, %l5) C w 3
175 addxccc(%l6, w3, w2)
176 umulxhi(u1, v2, %l6) C w 4
177 addxc( %l7, w4, w3)
178 umulxhi(u1, v3, %l7) C w 5
179 brlz n, L(top) C loop while the byte index is still negative
180 add n, 16, n C delay slot: step over the two limbs just handled
182 L(end): addcc %l0, w0, w0 C wind-down: last limb is in u0, no more loads from up
183 mulx u0, v0, %l0
184 addxccc(%l1, w1, w1)
185 mulx u0, v1, %l1
186 addxccc(%l2, w2, w2)
187 mulx u0, v2, %l2
188 addxccc(%l3, w3, w3)
189 mulx u0, v3, %l3
190 addxc( %g0, %g0, w4)
191 AM4(` addcc r0, w0, w0')
192 stx w0, [rp0 + n]
193 ADDX(` %l4, w1, w0')
194 umulxhi(u0, v0, %l4)
195 AM4(` ldx [rp1 + n], r0')
196 addxccc(%l5, w2, w1)
197 umulxhi(u0, v1, %l5)
198 addxccc(%l6, w3, w2)
199 umulxhi(u0, v2, %l6)
200 addxc( %l7, w4, w3)
201 umulxhi(u0, v3, %l7)
202 addcc %l0, w0, w0 C final pass: only additions remain
203 addxccc(%l1, w1, w1)
204 addxccc(%l2, w2, w2)
205 addxccc(%l3, w3, w3)
206 addxc( %g0, %g0, w4)
207 AM4(` addcc r0, w0, w0')
208 stx w0, [rp1 + n]
209 ADDX(` %l4, w1, w0')
210 addxccc(%l5, w2, w1)
211 addxccc(%l6, w3, w2)
212 stx w0, [rp2 + n]
213 add n, 16, n C move on to the last two stored limbs
214 stx w1, [rp1 + n]
215 stx w2, [rp2 + n]
216 addxc( %l7, w4, %i0) C most significant limb is the return value
217 ret C return to the caller; a bare restore would fall off the end
218 restore C delay slot: pops the window, so %i0 becomes the caller's %o0
219 EPILOGUE()