1 dnl SPARC v9 mpn_mul_2
and mpn_addmul_2 for T3
/T4
/T5.
3 dnl Contributed to the GNU project by Torbjörn Granlund.
5 dnl Copyright
2013 Free Software Foundation
, Inc.
7 dnl
This file is part of the GNU MP Library.
9 dnl The GNU MP Library is free software
; you can redistribute it and/or modify
10 dnl it under the terms of
either:
12 dnl
* the GNU Lesser General
Public License as published by the Free
13 dnl Software Foundation
; either version 3 of the License, or (at your
14 dnl option
) any later version.
18 dnl
* the GNU General
Public License as published by the Free Software
19 dnl Foundation
; either version 2 of the License, or (at your option) any later version.
22 dnl
or both
in parallel
, as here.
24 dnl The GNU MP Library is distributed
in the hope that it will be useful
, but
25 dnl WITHOUT ANY WARRANTY
; without even the implied warranty of MERCHANTABILITY
26 dnl
or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
Public License for more details.
29 dnl You should have received copies of the GNU General
Public License
and the
30 dnl GNU Lesser General
Public License along with the GNU MP Library. If
not,
31 dnl see
https://www.gnu.
org/licenses
/.
33 include(`..
/config.m4
')
36 C cycles/limb (mul_2) cycles/limb (addmul_2)
38 C UltraSPARC T3: 22.5 23.5
39 C UltraSPARC T4: 3.25 3.75
42 C The code is reasonably scheduled but also relies on OoO. There was hope that
43 C this could run at around 3.0 and 3.5 c/l respectively, on T4. Two cycles per
44 C iteration need to be removed.
46 C We could almost use 2-way unrolling, but currently the wN registers live too
47 C long. By changing add x,w1,w1 to add x,w1,w0, i.e. migrate the values down-
48 C wards, 2-way unrolling should become possible. With n-indexed addressing it
49 C should run no slower.
51 C The rp loads to g1/g3 are very much over-scheduled. Presumably, they could
52 C be postponed a full way, and then just one register could be used.
C Operation selection.  This one source is assembled into either of two
C functions, depending on which OPERATION_xxx symbol is defined:
C   OPERATION_mul_2     func is mpn_mul_2, and ADDX expands to a plain addcc
C                       followed by the operand text it was given.
C   OPERATION_addmul_2  func is mpn_addmul_2, and ADDX forwards its arguments
C                       to addxccc, so the add also consumes the carry.
C NOTE(review): AM2 is used throughout the body but its definition is not
C visible in this excerpt.  Presumably it emits its argument only in the
C addmul_2 build -- confirm against the complete file.
68 ifdef(`OPERATION_mul_2',`
70 define
(`ADDX
', `addcc`'$1')
71 define(`func', `mpn_mul_2
')
73 ifdef
(`OPERATION_addmul_2
',`
75 define(`ADDX', `addxccc
($1,$2,$3)')
76 define(`func', `mpn_addmul_2
')
C Lists the two function names this file can be built as.
C NOTE(review): MULFUNC_PROLOGUE and REGISTER are macros defined outside this
C excerpt.  REGISTER is invoked for g2 and g3 with the scratch attribute,
C presumably to emit the SPARC v9 register declaration that the assembler
C wants before those globals are used -- confirm in the macro definitions.
80 MULFUNC_PROLOGUE
(mpn_mul_2 mpn_addmul_2
)
83 REGISTER
(%g2
,#scratch
)
84 REGISTER
(%g3
,#scratch
)
C Fetch the two 64-bit multiplier limbs at vp+0 and vp+8 into v0 and v1.
C Every mulx/umulxhi in the code below uses one of these two values as an
C operand.  The names vp, v0 and v1 are aliases defined outside this excerpt.
88 ldx
[vp
+0], v0 C load v0
90 ldx
[vp
+8], v1 C load v1
C Four similar feed-in fragments.  Each one, inside AM2, loads the limbs at
C rp+0 and rp+8 into g1 and g3, and forms the high half of the product of g4
C and v1.  The fragments differ only in which of g1/g3 receives rp+0 and in
C whether the high product lands in l7 or l5.
C NOTE(review): these look like the four entry paths into the 4-way unrolled
C loop, one per residue of n mod 4.  The branches, labels and remaining
C instructions of each path are not visible in this excerpt -- confirm.
C Fragment 1: rp+0 to g1, rp+8 to g3, high product in l7.
101 AM2
(` ldx
[rp
+0], %g1
')
106 umulxhi(%g4, v1, %l7)
107 AM2(` ldx [rp+8], %g3')
C Fragment 2: rp+0 to g3, rp+8 to g1, high product in l5.
114 AM2
(` ldx
[rp
+0], %g3
')
119 umulxhi(%g4, v1, %l5)
120 AM2(` ldx [rp+8], %g1')
C Fragment 3: same register assignment as fragment 1.
128 AM2
(` ldx
[rp
+0], %g1
')
133 umulxhi(%g4, v1, %l7)
134 AM2(` ldx [rp+8], %g3')
C Fragment 4: same register assignment as fragment 2.
141 AM2
(` ldx
[rp
+0], %g3
')
146 umulxhi(%g4, v1, %l5)
147 AM2(` ldx [rp+8], %g1')
C Main loop, unrolled into four stages with entry labels top, lo1, lo0, lo3.
C
C Each stage takes one source limb, alternately held in i4 and i5, and forms
C four partial products with mulx for the low half and umulxhi for the high
C half: limb*v0 into l0/l4 or l2/l6, and limb*v1 into l1/l5 or l3/l7.
C
C The running sum lives in w0..w3, used as a rotating window.  Per stage:
C   1. AM2 adds the rp limb preloaded in g1 or g3 to the lowest live word.
C   2. ADDX and addxccc add the limb*v1 products of the previous stage.
C   3. addcc and addxccc add the limb*v0 products of the current stage.
C   4. addxc of g0 and g0 writes the next window word; g0 reads as zero on
C      SPARC, so per the mnemonic only the carry of step 3 is left.
C The window moves w0,w1,w2 then w1,w2,w3 then w2,w3,w0 then w3,w0,w1.
C
C The number after each trailing C looks like a scheduling slot, with a->b
C apparently meaning issue and use slots -- TODO confirm.
C NOTE(review): no store to rp, no pointer update and no loop branch appear
C in this excerpt; they are presumably on lines missing from it.
C
C Stage top: limb in i4, next limb loaded from up+0 into i5.
152 L
(top
): mulx
%i4
, v0
, %l2 C
0->5
153 umulxhi
(%i4
, v0
, %l6
) C
0->5
154 ldx
[up
+0], %i5 C
1->6
155 AM2
(` addcc w3
, %g3
, w3
') C 1
157 ADDX(` %l1, w0, w0') C
2
158 addxccc
(%l5
, w1
, w1
) C
3
159 mulx
%i4
, v1
, %l3 C
3->9
160 umulxhi
(%i4
, v1
, %l7
) C
4->9
161 AM2
(` ldx
[rp
+0], %g3
') C 4
162 addcc %l2, w0, w0 C 5
163 addxccc(%l6, w1, w1) C 5
164 addxc( %g0, %g0, w2) C 6
C Stage lo1: limb in i5; sums go to w1 and w2, carry word is w3.
165 L(lo1): mulx %i5, v0, %l0 C 6
166 umulxhi(%i5, v0, %l4) C 7
168 AM2(` addcc w0, %g1, w0') C
8
170 ADDX
(`
%l3
, w1
, w1
') C 9
171 addxccc(%l7, w2, w2) C 9
172 mulx %i5, v1, %l1 C 10
173 umulxhi(%i5, v1, %l5) C 10
174 AM2(` ldx [rp+8], %g1') C
11
175 addcc
%l0
, w1
, w1 C
11
176 addxccc
(%l4
, w2
, w2
) C
12
177 addxc
( %g0
, %g0
, w3
) C
12
C Stage lo0: limb in i4, next limb loaded from up+16 into i5; sums go to w2
C and w3, carry word is w0.
178 L
(lo0
): mulx
%i4
, v0
, %l2 C
13
179 umulxhi
(%i4
, v0
, %l6
) C
13
180 ldx
[up
+16], %i5 C
14
181 AM2
(` addcc w1
, %g3
, w1
') C 14
183 ADDX(` %l1, w2, w2') C
15
184 addxccc
(%l5
, w3
, w3
) C
16
185 mulx
%i4
, v1
, %l3 C
16
186 umulxhi
(%i4
, v1
, %l7
) C
17
187 AM2
(` ldx
[rp
+16], %g3
') C 17
188 addcc %l2, w2, w2 C 18
189 addxccc(%l6, w3, w3) C 18
190 addxc( %g0, %g0, w0) C 19
C Stage lo3: limb in i5, next limb loaded from up+24 into i4; sums go to w3
C and w0, carry word is w1.
191 L(lo3): mulx %i5, v0, %l0 C 19
192 umulxhi(%i5, v0, %l4) C 20
193 ldx [up+24], %i4 C 20
194 AM2(` addcc w2, %g1, w2') C
21
196 ADDX
(`
%l3
, w3
, w3
') C 22
197 addxccc(%l7, w0, w0) C 22
198 mulx %i5, v1, %l1 C 23
199 umulxhi(%i5, v1, %l5) C 23
200 AM2(` ldx [rp+24], %g1') C
24
201 addcc
%l0
, w3
, w3 C
24
202 addxccc
(%l4
, w0
, w0
) C
25
203 addxc
( %g0
, %g0
, w1
) C
25
C Wind-down code at label end.  The visible instructions repeat the start of
C the top and lo1 stages without loading a further up limb: the products of
C i4 with v0 go to l2/l6, the high product of i4 with v1 goes to l7, and AM2
C adds the rp limbs held in g3 and g1 into w3 and w0.
C NOTE(review): this block continues past the end of the excerpt, and lines
C inside it are missing.  The final carry propagation, stores and return are
C not visible here.
209 L
(end): mulx
%i4
, v0
, %l2
210 umulxhi
(%i4
, v0
, %l6
)
211 AM2
(` addcc w3
, %g3
, w3
')
216 umulxhi
(%i4
, v1
, %l7
)
220 AM2
(` addcc w0
, %g1
, w0
')