dnl  SPARC v9 32-bit mpn_mul_1 -- Multiply a limb vector with a limb and store
dnl  the result in a second limb vector.
dnl  Copyright 1998, 2000, 2001, 2003 Free Software Foundation, Inc.
dnl  This file is part of the GNU MP Library.
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.
32 include(`..
/config.m4
')
C Algorithm: We use two floating-point multiplies per limb product, with the
C invariant v operand split into two 16-bit pieces, and the u operand split
C into 32-bit pieces.  We convert the two 48-bit products and transfer them to
C the integer unit.
C Possible optimizations:
C   1. Combine 32-bit memory operations into 64-bit operations.  Since we're
C      memory bandwidth limited, this could save 1.5 cycles/limb.
C   2. Unroll the inner loop.  Since we already use alternate temporary areas,
C      it is very straightforward to unroll, using an exit branch midways.
C      Unrolling would allow deeper scheduling which could improve speed for L2
C      cache case.
C   3. For mpn_mul_1: Use more alternating temp areas.  The std'es and ldx'es
C      aren't sufficiently apart-scheduled with just two temp areas.
C   4. Specialize for particular v values.  If its upper 16 bits are zero, we
C      could save many operations.
66 sethi
%hi
(0xffff), %g1
68 or %g1
, %lo
(0xffff), %g1
76 ld
[%sp+104], %f10 C zero f10
80 define
(`fanop
', `fitod %f18, %f0') C A quasi
nop running
in the FA pipe
82 add %sp, 160, %o5 C point
in scratch area
83 and %o5
, -32, %o5 C
align at
0 (mod 32) in scratch area
86 ld
[%o1
], %f11 C read up
[i
]
87 add %o1
, 4, %o1 C up
++
88 bne
,pt
%icc
, .L_two_or_more
97 ldx
[%o5
+16], %g2 C p16
98 ldx
[%o5
+24], %g1 C p0
105 ld
[%o1
], %f11 C read up
[i
]
108 add %o1
, 4, %o1 C up
++
109 bne
,pt
%icc
, .L_three_or_more
122 ldx
[%o5
+16], %g2 C p16
123 ldx
[%o5
+24], %g1 C p0
130 ld
[%o1
], %f11 C read up
[i
]
137 add %o1
, 4, %o1 C up
++
138 bne
,pt
%icc
, .L_four_or_more
148 ldx
[%o5
+16], %g2 C p16
150 ldx
[%o5
+24], %g1 C p0
159 ld
[%o1
], %f11 C read up
[i
]
166 add %o1
, 4, %o1 C up
++
167 bne
,pt
%icc
, .L_five_or_more
171 ldx
[%o5
+16], %g2 C p16
173 ldx
[%o5
+24], %g1 C p0
178 add %o1
, 4, %o1 C up
++
185 ld
[%o1
], %f11 C read up
[i
]
187 ldx
[%o5
+16], %g2 C p16
189 ldx
[%o5
+24], %g1 C p0
194 add %o1
, 4, %o1 C up
++
204 ld
[%o1
], %f11 C read up
[i
]
207 sllx
%g2
, 16, %g4 C
(p16
<< 16)
208 add %o0
, 4, %o0 C rp
++
209 ldx
[%o5
+0], %g2 C p16
213 add %g1
, %g4
, %g4 C p
= p0
+ (p16
<< 16)
214 ldx
[%o5
+8], %g1 C p0
218 add %g3
, %g4
, %g4 C p
+= cy
222 srlx
%g4
, 32, %g3 C new cy
223 add %o1
, 4, %o1 C up
++
227 xor %o5
, 16, %o5 C alternate scratch variables
233 .
L5: fdtox
%f16
, %f14
234 sllx
%g2
, 16, %g4 C
(p16
<< 16)
235 ldx
[%o5
+0], %g2 C p16
237 add %g1
, %g4
, %g4 C p
= p0
+ (p16
<< 16)
238 ldx
[%o5
+8], %g1 C p0
239 add %g4
, %g3
, %g4 C p
+= cy
246 srlx
%g4
, 32, %g3 C new cy
248 .
L4: fdtox
%f16
, %f14
249 sllx
%g2
, 16, %g4 C
(p16
<< 16)
250 ldx
[%o5
+0], %g2 C p16
252 add %g1
, %g4
, %g4 C p
= p0
+ (p16
<< 16)
253 ldx
[%o5
+8], %g1 C p0
254 add %g3
, %g4
, %g4 C p
+= cy
259 srlx
%g4
, 32, %g3 C new cy
261 .
L3: sllx
%g2
, 16, %g4 C
(p16
<< 16)
262 ldx
[%o5
+0], %g2 C p16
263 add %g1
, %g4
, %g4 C p
= p0
+ (p16
<< 16)
264 ldx
[%o5
+8], %g1 C p0
265 add %g3
, %g4
, %g4 C p
+= cy
268 srlx
%g4
, 32, %g3 C new cy
270 .
L2: sllx
%g2
, 16, %g4 C
(p16
<< 16)
271 ldx
[%o5
+0], %g2 C p16
272 add %g1
, %g4
, %g4 C p
= p0
+ (p16
<< 16)
273 ldx
[%o5
+8], %g1 C p0
274 add %g3
, %g4
, %g4 C p
+= cy
276 srlx
%g4
, 32, %g3 C new cy
278 .
L1: sllx
%g2
, 16, %g4 C
(p16
<< 16)
279 add %g1
, %g4
, %g4 C p
= p0
+ (p16
<< 16)
280 add %g3
, %g4
, %g4 C p
+= cy
282 srlx
%g4
, 32, %g3 C new cy