1 dnl SPARC v9 mpn_mul_4 and mpn_addmul_4 for T3/T4/T5.
3 dnl Contributed to the GNU project by Torbjörn Granlund.
5 dnl Copyright 2013 Free Software Foundation, Inc.
7 dnl This file is part of the GNU MP Library.
9 dnl The GNU MP Library is free software; you can redistribute it and/or modify
10 dnl it under the terms of either:
12 dnl * the GNU Lesser General Public License as published by the Free
13 dnl Software Foundation; either version 3 of the License, or (at your
14 dnl option) any later version.
18 dnl * the GNU General Public License as published by the Free Software
19 dnl Foundation; either version 2 of the License, or (at your option) any
20 dnl later version.
22 dnl or both in parallel, as here.
24 dnl The GNU MP Library is distributed in the hope that it will be useful, but
25 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
26 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
27 dnl for more details.
29 dnl You should have received copies of the GNU General Public License and the
30 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
31 dnl see https://www.gnu.org/licenses/.
dnl Pull in the shared m4 configuration; the path must sit on one line inside
dnl the quotes, otherwise the newlines become part of the file name.
33 include(`../config.m4')
36 C cycles/limb cycles/limb
38 C UltraSPARC T3: 21.5 22.0
39 C UltraSPARC T4: 2.625 2.75
42 C The code is well-scheduled and relies on OoO very little. There is hope that
43 C this will run at around 2.5 and 2.75 c/l respectively, on T4.
C Operation selection.  Each branch is taken only when the named OPERATION_*
C macro is defined (presumably by the build system -- confirm) and binds:
C   ADDX  mul_4:    addcc followed directly by the first argument
C         addmul_4: addxccc called with the first three arguments
C   func  the entry-point name, mpn_mul_4 or mpn_addmul_4
C Every macro name is kept on the same line as its opening parenthesis and
C every quoted name on one line; m4 does not see a call or the intended name
C otherwise.
C NOTE(review): this excerpt shows neither the closing quote-paren of each
C ifdef nor a definition of AM4, which is used further down; confirm both
C exist in the complete file.
72 ifdef(`OPERATION_mul_4',`
74 define(`ADDX', `addcc`'$1')
75 define(`func', `mpn_mul_4')
77 ifdef(`OPERATION_addmul_4',`
79 define(`ADDX', `addxccc($1,$2,$3)')
80 define(`func', `mpn_addmul_4')
C Name the two entry points this file can be built as, then mark %g2 and %g3
C as scratch.  NOTE(review): MULFUNC_PROLOGUE and REGISTER are defined outside
C this excerpt; REGISTER presumably emits the assembler .register directive --
C confirm against the shared macro definitions.
84 MULFUNC_PROLOGUE(mpn_mul_4 mpn_addmul_4)
87 REGISTER(%g2,#scratch)
88 REGISTER(%g3,#scratch)
C Entry sequence as far as it is visible here: start the first source load
C early, then test the low bit of n.  The andcc writes to %g0, so only the
C condition codes are kept.  The AM4-wrapped load reads r0 from rp2 + n.
C NOTE(review): the embedded listing numbers jump from 93 to 120, so the
C instructions between the parity test and the AM4 load are not shown.
92 ldx [up + 0], u1 C load up[0] early
93 andcc n, 1, %g0 C is n odd?
120 AM4(` ldx [rp2 + n], r0')
C L(evn): load a 64-bit word from up1 + n into u0; the AM4-wrapped load reads
C r0 from rp2 + n.
C NOTE(review): evn presumably names the even-n entry chosen by the parity
C test on n; the branch that reaches this label is not visible here.
128 L(evn): ldx [up1 + n], u0
129 AM4(` ldx [rp2 + n], r0')
C L(top): the u0 half of the loop as visible in this excerpt.
C   addcc    folds %l0 into w0 and sets the carry for what follows
C   mulx     low 64 bits of u0 * v0..v3 into %l0..%l3
C   umulxhi  high 64 bits of the same products into %l4..%l7
C   AM4(...) wrapped instructions add r0 into w0 and load the next r0 from
C            rp1 + n
C The trailing w k notes appear to give the result-limb weight each product
C contributes to.
C NOTE(review): the embedded listing numbers have gaps (139, 141, ...), so
C instructions interleaved with these are not shown; keep the order as is.
137 L(top): addcc %l0, w0, w0
138 mulx u0, v0, %l0 C w 0
140 mulx u0, v1, %l1 C w 1
142 mulx u0, v2, %l2 C w 2
144 mulx u0, v3, %l3 C w 3
147 AM4(` addcc r0, w0, w0')
150 umulxhi(u0, v0, %l4) C w 1
151 AM4(` ldx [rp1 + n], r0')
153 umulxhi(u0, v1, %l5) C w 2
155 umulxhi(u0, v2, %l6) C w 3
157 umulxhi(u0, v3, %l7) C w 4
C L(mid): the u1 half of the loop as visible in this excerpt.  Same shape as
C the u0 half with the weights shifted up by one:
C   addcc    folds %l0 into w0 and sets the carry for what follows
C   mulx     low 64 bits of u1 * v0..v3 into %l0..%l3
C   umulxhi  high 64 bits of the same products into %l4..%l7
C   AM4(...) wrapped instructions add r0 into w0 and load the next r0 from
C            rp2 + n
C NOTE(review): the embedded listing numbers have gaps (160, 162, ...), so
C instructions interleaved with these are not shown; keep the order as is.
158 L(mid): addcc %l0, w0, w0
159 mulx u1, v0, %l0 C w 1
161 mulx u1, v1, %l1 C w 2
163 mulx u1, v2, %l2 C w 3
165 mulx u1, v3, %l3 C w 4
168 AM4(` addcc r0, w0, w0')
171 umulxhi(u1, v0, %l4) C w 2
172 AM4(` ldx [rp2 + n], r0')
174 umulxhi(u1, v1, %l5) C w 3
176 umulxhi(u1, v2, %l6) C w 4
178 umulxhi(u1, v3, %l7) C w 5
C L(end): wind-down after the loop as visible in this excerpt.  %l0 is folded
C into w0; the AM4-wrapped instructions add r0 into w0, load one more r0 from
C rp1 + n, and add r0 into w0 again.
C NOTE(review): the embedded listing numbers jump (182, 191, 195, 207), so
C most of the wind-down, including the stores, is not shown here.
182 L(end): addcc %l0, w0, w0
191 AM4(` addcc r0, w0, w0')
195 AM4(` ldx [rp1 + n], r0')
207 AM4(` addcc r0, w0, w0')