dnl  SPARC v9 32-bit mpn_addmul_1 -- Multiply a limb vector with a limb and add
dnl  the result to a second limb vector.
4 dnl Copyright
1998, 2000, 2001, 2003 Free Software Foundation
, Inc.
6 dnl
This file is part of the GNU MP Library.
8 dnl The GNU MP Library is free software
; you can redistribute it and/or modify
9 dnl it under the terms of
either:
11 dnl
* the GNU Lesser General
Public License as published by the Free
12 dnl Software Foundation
; either version 3 of the License, or (at your
13 dnl option
) any later version.
17 dnl
* the GNU General
Public License as published by the Free Software
18 dnl Foundation
; either version 2 of the License, or (at your option) any
21 dnl
or both
in parallel
, as here.
23 dnl The GNU MP Library is distributed
in the hope that it will be useful
, but
24 dnl WITHOUT ANY WARRANTY
; without even the implied warranty of MERCHANTABILITY
25 dnl
or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
Public License
28 dnl You should have received copies of the GNU General
Public License
and the
29 dnl GNU Lesser General
Public License along with the GNU MP Library. If
not,
30 dnl see
https://www.gnu.
org/licenses
/.
32 include(`..
/config.m4
')
34 C Algorithm: We use two floating-point multiplies per limb product, with the
35 C invariant v operand split into two 16-bit pieces, and the u operand split
36 C into 32-bit pieces. We convert the two 48-bit products and transfer them to
43 C Possible optimizations:
44 C 1. Combine 32-bit memory operations into 64-bit operations. Since we're
45 C memory bandwidth limited
, this could save
1.5 cycles
/limb.
C 2. Unroll the inner loop.  Since we already use alternate temporary areas,
C    it is very straightforward to unroll, using an exit branch midway.
48 C Unrolling would allow deeper scheduling which could improve speed for L2
C 3. For mpn_mul_1: Use more alternating temp areas.  The std and ldx
C    instructions are not scheduled sufficiently far apart with just two temp
C    areas.
52 C 4. Specialize for particular v values. If its upper 16 bits are zero, we
53 C could save many operations.
C mpn_addmul_1 -- per the file header: multiply a limb vector with a limb and
C add the result to a second limb vector.
C
C Register roles, as given by the per-instruction comments below:
C   %o0   rp, the limb vector whose elements are read with lduw
C   %o1   up, the limb vector whose elements are loaded into %f11
C   %o5   pointer into the scratch area at %sp+160, with its low five bits
C         cleared by the and with -32; xor with 16 alternates between the
C         two sets of scratch variables
C   %g1   p0 and %g2 p16, the two partial products read back from the
C         scratch area (offsets 24/16 in the entry code, 8/0 later on)
C   %g3   cy, the carry: the upper 32 bits of the previous 64-bit sum
C   %g4   p = p0 + (p16 << 16) + cy + rp[i]
C   %g5   rp[i]
C   %f11  up[i]
C
C fanop is defined here as a floating-point instruction used as a quasi nop.
64 PROLOGUE
(mpn_addmul_1
)
66 sethi
%hi
(0xffff), %g1
68 or %g1
, %lo
(0xffff), %g1
76 ld
[%sp+104], %f10 C zero f10
80 define
(`fanop
', `fitod %f18, %f0') C A quasi
nop running
in the FA pipe
82 add %sp, 160, %o5 C point
in scratch area
83 and %o5
, -32, %o5 C
align at
0 (mod 32) in scratch area
86 ld
[%o1
], %f11 C read up
[i
]
87 add %o1
, 4, %o1 C up
++
88 bne
,pt
%icc
, .L_two_or_more
97 ldx
[%o5
+16], %g2 C p16
98 ldx
[%o5
+24], %g1 C p0
99 lduw
[%o0
], %g5 C read rp
[i
]
106 ld
[%o1
], %f11 C read up
[i
]
109 add %o1
, 4, %o1 C up
++
110 bne
,pt
%icc
, .L_three_or_more
123 lduw
[%o0
], %g5 C read rp
[i
]
124 ldx
[%o5
+16], %g2 C p16
125 ldx
[%o5
+24], %g1 C p0
132 ld
[%o1
], %f11 C read up
[i
]
139 add %o1
, 4, %o1 C up
++
140 bne
,pt
%icc
, .L_four_or_more
150 ldx
[%o5
+16], %g2 C p16
152 ldx
[%o5
+24], %g1 C p0
155 lduw
[%o0
], %g5 C read rp
[i
]
162 ld
[%o1
], %f11 C read up
[i
]
169 add %o1
, 4, %o1 C up
++
170 bne
,pt
%icc
, .L_five_or_more
174 ldx
[%o5
+16], %g2 C p16
176 ldx
[%o5
+24], %g1 C p0
181 add %o1
, 4, %o1 C up
++
182 lduw
[%o0
], %g5 C read rp
[i
]
189 ld
[%o1
], %f11 C read up
[i
]
191 ldx
[%o5
+16], %g2 C p16
193 ldx
[%o5
+24], %g1 C p0
198 add %o1
, 4, %o1 C up
++
199 lduw
[%o0
], %g5 C read rp
[i
]
C Combine step with pointer updates: form p = p0 + (p16 << 16) + cy + rp[i],
C reload p16 and p0 from the scratch area, advance rp and up, switch to the
C other set of scratch variables, and take the upper 32 bits of p as the new cy.
209 ld
[%o1
], %f11 C read up
[i
]
212 sllx
%g2
, 16, %g4 C
(p16
<< 16)
213 add %o0
, 4, %o0 C rp
++
214 ldx
[%o5
+0], %g2 C p16
218 add %g1
, %g4
, %g4 C p
= p0
+ (p16
<< 16)
219 ldx
[%o5
+8], %g1 C p0
223 add %g3
, %g4
, %g4 C p
+= cy
228 add %g5
, %g4
, %g4 C p
+= rp
[i
]
232 xor %o5
, 16, %o5 C alternate scratch variables
233 add %o1
, 4, %o1 C up
++
237 srlx
%g4
, 32, %g3 C new cy
238 lduw
[%o0
], %g5 C read rp
[i
]
C .L5: same combine step; the rp limb for the following step is read from
C [%o0+4].
243 .
L5: fdtox
%f16
, %f14
244 sllx
%g2
, 16, %g4 C
(p16
<< 16)
245 ldx
[%o5
+0], %g2 C p16
247 add %g1
, %g4
, %g4 C p
= p0
+ (p16
<< 16)
248 ldx
[%o5
+8], %g1 C p0
249 add %g4
, %g3
, %g4 C p
+= cy
252 add %g5
, %g4
, %g4 C p
+= rp
[i
]
257 srlx
%g4
, 32, %g3 C new cy
258 lduw
[%o0
+4], %g5 C read rp
[i
]
C .L4: same combine step; the rp limb for the following step is read from
C [%o0+8].
260 .
L4: fdtox
%f16
, %f14
261 sllx
%g2
, 16, %g4 C
(p16
<< 16)
262 ldx
[%o5
+0], %g2 C p16
264 add %g1
, %g4
, %g4 C p
= p0
+ (p16
<< 16)
265 ldx
[%o5
+8], %g1 C p0
266 add %g3
, %g4
, %g4 C p
+= cy
268 add %g5
, %g4
, %g4 C p
+= rp
[i
]
272 srlx
%g4
, 32, %g3 C new cy
273 lduw
[%o0
+8], %g5 C read rp
[i
]
C .L3: same combine step; the rp limb for the following step is read from
C [%o0+12].
275 .
L3: sllx
%g2
, 16, %g4 C
(p16
<< 16)
276 ldx
[%o5
+0], %g2 C p16
277 add %g1
, %g4
, %g4 C p
= p0
+ (p16
<< 16)
278 ldx
[%o5
+8], %g1 C p0
279 add %g3
, %g4
, %g4 C p
+= cy
280 add %g5
, %g4
, %g4 C p
+= rp
[i
]
283 srlx
%g4
, 32, %g3 C new cy
284 lduw
[%o0
+12], %g5 C read rp
[i
]
C .L2: same combine step; the rp limb for the following step is read from
C [%o0+16].
286 .
L2: sllx
%g2
, 16, %g4 C
(p16
<< 16)
287 ldx
[%o5
+0], %g2 C p16
288 add %g1
, %g4
, %g4 C p
= p0
+ (p16
<< 16)
289 ldx
[%o5
+8], %g1 C p0
290 add %g3
, %g4
, %g4 C p
+= cy
291 add %g5
, %g4
, %g4 C p
+= rp
[i
]
293 srlx
%g4
, 32, %g3 C new cy
294 lduw
[%o0
+16], %g5 C read rp
[i
]
C .L1: last combine step in this listing; no further partial products or rp
C limbs are loaded, and %g3 ends up holding the upper 32 bits of the last sum.
296 .
L1: sllx
%g2
, 16, %g4 C
(p16
<< 16)
297 add %g1
, %g4
, %g4 C p
= p0
+ (p16
<< 16)
298 add %g3
, %g4
, %g4 C p
+= cy
299 add %g5
, %g4
, %g4 C p
+= rp
[i
]
301 srlx
%g4
, 32, %g3 C new cy
306 EPILOGUE
(mpn_addmul_1
)