dnl beta-0.89.2
dnl [luatex.git] / source / libs / gmp / gmp-src / mpn / sparc64 / ultrasparct3 / aormul_2.asm
dnl blob ccc6a4408d599d4131c00d6892026d011f55d3b8
1 dnl SPARC v9 mpn_mul_2 and mpn_addmul_2 for T3/T4/T5.
3 dnl Contributed to the GNU project by Torbjörn Granlund.
5 dnl Copyright 2013 Free Software Foundation, Inc.
7 dnl This file is part of the GNU MP Library.
8 dnl
9 dnl The GNU MP Library is free software; you can redistribute it and/or modify
10 dnl it under the terms of either:
11 dnl
12 dnl * the GNU Lesser General Public License as published by the Free
13 dnl Software Foundation; either version 3 of the License, or (at your
14 dnl option) any later version.
15 dnl
16 dnl or
17 dnl
18 dnl * the GNU General Public License as published by the Free Software
19 dnl Foundation; either version 2 of the License, or (at your option) any
20 dnl later version.
21 dnl
22 dnl or both in parallel, as here.
23 dnl
24 dnl The GNU MP Library is distributed in the hope that it will be useful, but
25 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
26 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
27 dnl for more details.
28 dnl
29 dnl You should have received copies of the GNU General Public License and the
30 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
31 dnl see https://www.gnu.org/licenses/.
33 include(`../config.m4')
36 C cycles/limb cycles/limb
37 C mul_2 addmul_2
38 C UltraSPARC T3: 22.5 23.5
39 C UltraSPARC T4: 3.25 3.75
42 C The code is reasonably scheduled but also relies on OoO. There was hope that
43 C this could run at around 3.0 and 3.5 c/l respectively, on T4. Two cycles per
44 C iteration needs to be removed.
46 C We could almost use 2-way unrolling, but currently the wN registers live too
47 C long. By changing add x,w1,w1 to add x,w1,w0, i.e. migrating the values down-
48 C wards, 2-way unrolling should become possible. With n-indexed addressing it
49 C should run no slower.
51 C The rp loads to g1/g3 are very much over-scheduled. Presumably, they could
52 C be postponed a full way, and then just one register could be used.
54 C INPUT PARAMETERS
55 define(`rp', `%i0') dnl result (product) pointer
56 define(`up', `%i1') dnl source operand limb pointer
57 define(`n', `%i2') dnl limb count of {up,n}
58 define(`vp', `%i3') dnl pointer to the two multiplier limbs (loaded below as v0/v1)
60 define(`v0', `%o0') dnl vp[0]
61 define(`v1', `%o1') dnl vp[1]
63 define(`w0', `%o2') dnl w0..w3 form the rotating accumulator window
64 define(`w1', `%o3') dnl used by the 4-way unrolled loop
65 define(`w2', `%o4')
66 define(`w3', `%o5')
dnl Operation selection: AM2 expands its argument only for addmul_2
dnl (empty expansion for mul_2), gating the rp preloads and rp adds.
dnl ADDX is a plain addcc for mul_2 but a carry-propagating addxccc
dnl for addmul_2, where the extra rp addition feeds a second carry chain.
dnl NOTE(review): the closing ') lines of both ifdef blocks (original
dnl lines 72 and 77) appear to have been lost in extraction -- confirm
dnl against the upstream GMP source.
68 ifdef(`OPERATION_mul_2',`
69 define(`AM2', `')
70 define(`ADDX', `addcc`'$1')
71 define(`func', `mpn_mul_2')
73 ifdef(`OPERATION_addmul_2',`
74 define(`AM2', `$1')
75 define(`ADDX', `addxccc($1,$2,$3)')
76 define(`func', `mpn_addmul_2')
80 MULFUNC_PROLOGUE(mpn_mul_2 mpn_addmul_2)
82 ASM_START()
83 REGISTER(%g2,#scratch) dnl declare %g2/%g3 as assembler scratch registers
84 REGISTER(%g3,#scratch)
85 PROLOGUE(func)
C Open a register window; 176 bytes is the minimum SPARC v9 save area.
86 save %sp, -176, %sp
88 ldx [vp+0], v0 C load v0
89 and n, 3, %g5 C %g5 = n mod 4, selects the loop entry point
90 ldx [vp+8], v1 C load v1
91 add n, -6, n C bias n for the brgz/add -4 loop control below
92 ldx [up+0], %g4 C first limb of up
C Dispatch on n mod 4: 0 -> L(b0), 1 -> L(b1), 2 -> L(b2), 3 falls to L(b3).
93 brz %g5, L(b0)
94 cmp %g5, 2
95 bcs L(b1) C %g5 == 1
96 nop
97 be L(b2) C %g5 == 2
98 nop
C Entry for n mod 4 == 3: compute the first column products for up[0],
C then join the software pipeline at L(lo3).  For addmul_2 the AM2 loads
C preload rp limbs into %g1/%g3.  Pointers are biased by -8 so the fixed
C offsets inside the shared loop line up.
100 L(b3):
101 AM2(` ldx [rp+0], %g1')
102 mulx %g4, v0, w2 C low(up[0] * v0)
103 umulxhi(%g4, v0, w3) C high(up[0] * v0)
104 ldx [up+8], %i5
105 mulx %g4, v1, %l3 C low(up[0] * v1)
106 umulxhi(%g4, v1, %l7) C high(up[0] * v1)
107 AM2(` ldx [rp+8], %g3')
108 add up, -8, up
109 add rp, -8, rp
110 b L(lo3)
111 mov 0, w0 C delay slot: clear the incoming accumulator limb
C Entry for n mod 4 == 2: this is also the only entry that can skip the
C main loop entirely (biased n < 0, i.e. original n == 2, branches
C straight to the wind-down at L(end)).
113 L(b2):
114 AM2(` ldx [rp+0], %g3')
115 mulx %g4, v0, w3 C low(up[0] * v0)
116 umulxhi(%g4, v0, w0) C high(up[0] * v0)
117 ldx [up+8], %i4
118 mulx %g4, v1, %l1 C low(up[0] * v1)
119 umulxhi(%g4, v1, %l5) C high(up[0] * v1)
120 AM2(` ldx [rp+8], %g1')
121 add rp, 16, rp
122 brlz n, L(end)
123 mov 0, w1 C delay slot: clear the incoming accumulator limb
124 ba L(top)
125 add up, 16, up C delay slot: advance up for the loop's fixed offsets
C Entry for n mod 4 == 1: join the pipeline at L(lo1).
127 L(b1):
128 AM2(` ldx [rp+0], %g1')
129 mulx %g4, v0, w0 C low(up[0] * v0)
130 umulxhi(%g4, v0, w1) C high(up[0] * v0)
131 ldx [up+8], %i5
132 mulx %g4, v1, %l3 C low(up[0] * v1)
133 umulxhi(%g4, v1, %l7) C high(up[0] * v1)
134 AM2(` ldx [rp+8], %g3')
135 add up, 8, up
136 add rp, 8, rp
137 b L(lo1)
138 mov 0, w2 C delay slot: clear the incoming accumulator limb
C Entry for n mod 4 == 0: join the pipeline at L(lo0).
140 L(b0):
141 AM2(` ldx [rp+0], %g3')
142 mulx %g4, v0, w1 C low(up[0] * v0)
143 umulxhi(%g4, v0, w2) C high(up[0] * v0)
144 ldx [up+8], %i4
145 mulx %g4, v1, %l1 C low(up[0] * v1)
146 umulxhi(%g4, v1, %l5) C high(up[0] * v1)
147 AM2(` ldx [rp+8], %g1')
148 b L(lo0)
149 mov 0, w3 C delay slot: clear the incoming accumulator limb
C Main loop: 4-way unrolled software pipeline, one up limb retired per
C 1/4 iteration.  w0..w3 rotate as the accumulator window, %i4/%i5
C alternate as the current up limb, and %g1/%g3 alternate as preloaded
C rp limbs (addmul_2 only; AM2/ADDX expand to nothing / a plain addcc
C for mul_2).  The trailing "C <n>" annotations are the planned T4
C issue cycles; the addxc of %g0 captures the final carry of each
C column into a freshly-cleared window limb.
151 ALIGN(16) C cycle
152 L(top): mulx %i4, v0, %l2 C 0->5
153 umulxhi(%i4, v0, %l6) C 0->5
154 ldx [up+0], %i5 C 1->6
155 AM2(` addcc w3, %g3, w3') C 1
156 stx w3, [rp-16] C 2
157 ADDX(` %l1, w0, w0') C 2
158 addxccc(%l5, w1, w1) C 3
159 mulx %i4, v1, %l3 C 3->9
160 umulxhi(%i4, v1, %l7) C 4->9
161 AM2(` ldx [rp+0], %g3') C 4
162 addcc %l2, w0, w0 C 5
163 addxccc(%l6, w1, w1) C 5
164 addxc( %g0, %g0, w2) C 6
165 L(lo1): mulx %i5, v0, %l0 C 6
166 umulxhi(%i5, v0, %l4) C 7
167 ldx [up+8], %i4 C 7
168 AM2(` addcc w0, %g1, w0') C 8
169 stx w0, [rp-8] C 8
170 ADDX(` %l3, w1, w1') C 9
171 addxccc(%l7, w2, w2) C 9
172 mulx %i5, v1, %l1 C 10
173 umulxhi(%i5, v1, %l5) C 10
174 AM2(` ldx [rp+8], %g1') C 11
175 addcc %l0, w1, w1 C 11
176 addxccc(%l4, w2, w2) C 12
177 addxc( %g0, %g0, w3) C 12
178 L(lo0): mulx %i4, v0, %l2 C 13
179 umulxhi(%i4, v0, %l6) C 13
180 ldx [up+16], %i5 C 14
181 AM2(` addcc w1, %g3, w1') C 14
182 stx w1, [rp+0] C 15
183 ADDX(` %l1, w2, w2') C 15
184 addxccc(%l5, w3, w3) C 16
185 mulx %i4, v1, %l3 C 16
186 umulxhi(%i4, v1, %l7) C 17
187 AM2(` ldx [rp+16], %g3') C 17
188 addcc %l2, w2, w2 C 18
189 addxccc(%l6, w3, w3) C 18
190 addxc( %g0, %g0, w0) C 19
191 L(lo3): mulx %i5, v0, %l0 C 19
192 umulxhi(%i5, v0, %l4) C 20
193 ldx [up+24], %i4 C 20
194 AM2(` addcc w2, %g1, w2') C 21
195 stx w2, [rp+8] C 21
196 ADDX(` %l3, w3, w3') C 22
197 addxccc(%l7, w0, w0) C 22
198 mulx %i5, v1, %l1 C 23
199 umulxhi(%i5, v1, %l5) C 23
200 AM2(` ldx [rp+24], %g1') C 24
201 addcc %l0, w3, w3 C 24
202 addxccc(%l4, w0, w0) C 25
203 addxc( %g0, %g0, w1) C 25
204 add up, 32, up
205 add rp, 32, rp
206 brgz n, L(top)
207 add n, -4, n C delay slot: 4 limbs retired per iteration
C Wind-down: the pipeline still holds products for the last up limb.
C Finish both column sums, store the last two product limbs, and fold
C the final carry into %i0; after restore that window register becomes
C the caller's %o0, i.e. the returned high limb.
209 L(end): mulx %i4, v0, %l2
210 umulxhi(%i4, v0, %l6)
211 AM2(` addcc w3, %g3, w3')
212 stx w3, [rp-16]
213 ADDX(` %l1, w0, w0')
214 addxccc(%l5, w1, w1)
215 mulx %i4, v1, %l3
216 umulxhi(%i4, v1, %l7)
217 addcc %l2, w0, w0
218 addxccc(%l6, w1, w1)
219 addxc( %g0, %g0, w2)
220 AM2(` addcc w0, %g1, w0')
221 stx w0, [rp-8]
222 ADDX(` %l3, w1, w1')
223 stx w1, [rp+0]
224 addxc(%l7, w2, %i0) C return value: high limb
227 restore
228 EPILOGUE()