dnl  SPARC v9 32-bit mpn_submul_1 -- Multiply a limb vector with a limb and
dnl  subtract the result from a second limb vector.
4 dnl Copyright
1998, 2000, 2001, 2003 Free Software Foundation
, Inc.
6 dnl
This file is part of the GNU MP Library.
8 dnl The GNU MP Library is free software
; you can redistribute it and/or modify
9 dnl it under the terms of
either:
11 dnl
* the GNU Lesser General
Public License as published by the Free
12 dnl Software Foundation
; either version 3 of the License, or (at your
13 dnl option
) any later version.
17 dnl
* the GNU General
Public License as published by the Free Software
18 dnl Foundation
; either version 2 of the License, or (at your option) any
21 dnl
or both
in parallel
, as here.
23 dnl The GNU MP Library is distributed
in the hope that it will be useful
, but
24 dnl WITHOUT ANY WARRANTY
; without even the implied warranty of MERCHANTABILITY
25 dnl
or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
Public License
28 dnl You should have received copies of the GNU General
Public License
and the
29 dnl GNU Lesser General
Public License along with the GNU MP Library. If
not,
30 dnl see
https://www.gnu.
org/licenses
/.
32 include(`..
/config.m4
')
34 C Algorithm: We use two floating-point multiplies per limb product, with the
35 C invariant v operand split into two 16-bit pieces, and the u operand split
36 C into 32-bit pieces. We convert the two 48-bit products and transfer them to
C Possible optimizations:
C   1. Combine 32-bit memory operations into 64-bit operations.  Since we're
C      memory bandwidth limited, this could save 1.5 cycles/limb.
C   2. Unroll the inner loop.  Since we already use alternate temporary areas,
C      it is very straightforward to unroll, using an exit branch midway.
C      Unrolling would allow deeper scheduling which could improve speed for L2
C   3. For mpn_mul_1: Use more alternating temp areas.  The std'es and ldx'es
C      aren't sufficiently apart-scheduled with just two temp areas.
C   4. Specialize for particular v values.  If its upper 16 bits are zero, we
C      could save many operations.
C mpn_submul_1: entry point and feed-in code.
C NOTE(review): this view of the routine is incomplete.  The number fused to
C the front of each statement skips (64, 66, 68, 76, ...), so instructions that
C sit between the visible ones are not shown here.  Register roles mentioned
C below are taken from the inline remarks: %o0 = rp, %o1 = up, %o5 = scratch
C pointer, %g2 = p16, %g1 = p0, %g5 = rp[i].
64 PROLOGUE
(mpn_submul_1
)
C Build the constant 0xffff in %g1: sethi supplies the high part and the
C following or merges in the low part.
66 sethi
%hi
(0xffff), %g1
68 or %g1
, %lo
(0xffff), %g1
C Load one 32-bit word from the stack slot at %sp+104 into %f10.  The inline
C remark expects that word to be zero; the store that would make it so is not
C visible in this view.
76 ld
[%sp+104], %f10 C zero f10
C fanop is an m4 macro that expands to the single instruction
C fitod %f18, %f0.
80 define
(`fanop
', `fitod %f18, %f0') C A quasi
nop running
in the FA pipe
C %o5 = (%sp + 160) & -32, i.e. a 32-byte-aligned address derived from the
C stack pointer, used as the base for the [%o5+0..24] accesses below.
82 add %sp, 160, %o5 C point
in scratch area
83 and %o5
, -32, %o5 C
align at
0 (mod 32) in scratch area
C Feed-in segment 1: load a word of up[] into %f11 and step up by 4 bytes.
86 ld
[%o1
], %f11 C read up
[i
]
87 add %o1
, 4, %o1 C up
++
C Conditional branch on the integer condition codes.  The instruction that
C sets them and the definition of the target label are not visible here;
C presumably this tests the remaining limb count -- confirm.
88 bne
,pt
%icc
, .L_two_or_more
C Load the two 64-bit partial products from offsets 16 and 24 of the scratch
C area, and the 32-bit rp[] word they will be combined with.
97 ldx
[%o5
+16], %g2 C p16
98 ldx
[%o5
+24], %g1 C p0
99 lduw
[%o0
], %g5 C read rp
[i
]
C Feed-in segment 2: same visible pattern (load up[] word, step up, branch,
C then load rp[i], p16 and p0).
106 ld
[%o1
], %f11 C read up
[i
]
109 add %o1
, 4, %o1 C up
++
110 bne
,pt
%icc
, .L_three_or_more
123 lduw
[%o0
], %g5 C read rp
[i
]
124 ldx
[%o5
+16], %g2 C p16
125 ldx
[%o5
+24], %g1 C p0
C Feed-in segment 3: same visible pattern.
132 ld
[%o1
], %f11 C read up
[i
]
139 add %o1
, 4, %o1 C up
++
140 bne
,pt
%icc
, .L_four_or_more
150 ldx
[%o5
+16], %g2 C p16
152 ldx
[%o5
+24], %g1 C p0
155 lduw
[%o0
], %g5 C read rp
[i
]
C Feed-in segment 4: same visible pattern, with one more up step before the
C rp[i] load.
162 ld
[%o1
], %f11 C read up
[i
]
169 add %o1
, 4, %o1 C up
++
170 bne
,pt
%icc
, .L_five_or_more
174 ldx
[%o5
+16], %g2 C p16
176 ldx
[%o5
+24], %g1 C p0
181 add %o1
, 4, %o1 C up
++
182 lduw
[%o0
], %g5 C read rp
[i
]
C Feed-in segment 5: no conditional branch is visible in this segment; it
C loads the up[] word, p16, p0 and rp[i] and steps up.
189 ld
[%o1
], %f11 C read up
[i
]
191 ldx
[%o5
+16], %g2 C p16
193 ldx
[%o5
+24], %g1 C p0
198 add %o1
, 4, %o1 C up
++
199 lduw
[%o0
], %g5 C read rp
[i
]
C .Loop: steady-state iteration.  Only part of its instructions is visible in
C this view (the fused leading numbers skip), including whatever branches back
C to the label.  Register roles are taken from the inline remarks:
C %g2 = p16, %g1 = p0, %g3 = cy, %g5 = rp[i], %g4 = working value p.
C The first instruction computes %g3 = 0 - %g3, i.e. it negates the value left
C in %g3 by the preceding srlx.
207 .
Loop: sub %g0
, %g3
, %g3
C Load the next up[] word into %f11.
209 ld
[%o1
], %f11 C read up
[i
]
C %g4 = p16 << 16; rp is advanced by one 4-byte word, and %g2 is reloaded from
C [%o5+0] for a later step.
212 sllx
%g2
, 16, %g4 C
(p16
<< 16)
213 add %o0
, 4, %o0 C rp
++
214 ldx
[%o5
+0], %g2 C p16
C Keep only the low 32 bits of cy, then form p = p0 + (p16 << 16) and reload
C %g1 from [%o5+8].
217 srl
%g3
, 0, %g3 C zero most significant
32 bits
218 add %g1
, %g4
, %g4 C p
= p0
+ (p16
<< 16)
219 ldx
[%o5
+8], %g1 C p0
C Add the carry into the product.
223 add %g3
, %g4
, %g4 C p
+= cy
C %g4 = %g5 - %g4, that is rp[i] minus (product + cy).  The inline remark
C reads "p += rp[i]", but the operation performed is a subtraction.
228 sub %g5
, %g4
, %g4 C p
+= rp
[i
]
C Flip bit 4 of %o5 so that the fixed offsets address the other 16-byte part
C of the scratch area on the next pass.
232 xor %o5
, 16, %o5 C alternate scratch variables
233 add %o1
, 4, %o1 C up
++
C Bits 63..32 of the 64-bit difference become the new cy; then load the next
C rp[] word.
237 srlx
%g4
, 32, %g3 C new cy
238 lduw
[%o0
], %g5 C read rp
[i
]
C .L5: wind-down stage (only part of its instructions is visible in this
C view).  It starts by negating %g3 (%g3 = 0 - %g3), the value left by the
C preceding srlx.  Register roles per the inline remarks: %g2 = p16, %g1 = p0,
C %g3 = cy, %g5 = rp[i], %g4 = working value p.
243 .
L5: sub %g0
, %g3
, %g3
C %g4 = p16 << 16, then reload %g2 from [%o5+0] for a later stage.
245 sllx
%g2
, 16, %g4 C
(p16
<< 16)
246 ldx
[%o5
+0], %g2 C p16
C Keep only the low 32 bits of cy, form p = p0 + (p16 << 16), reload %g1 from
C [%o5+8], and add cy.
248 srl
%g3
, 0, %g3 C zero most significant
32 bits
249 add %g1
, %g4
, %g4 C p
= p0
+ (p16
<< 16)
250 ldx
[%o5
+8], %g1 C p0
251 add %g4
, %g3
, %g4 C p
+= cy
C %g4 = %g5 - %g4, that is rp[i] minus (product + cy).  The inline remark
C reads "p += rp[i]", but the operation performed is a subtraction.
254 sub %g5
, %g4
, %g4 C p
+= rp
[i
]
C Bits 63..32 of the difference become the new cy; the next rp[] word is read
C from offset 4 (no rp increment is visible in this stage).
259 srlx
%g4
, 32, %g3 C new cy
260 lduw
[%o0
+4], %g5 C read rp
[i
]
C .L4: wind-down stage (only part of its instructions is visible in this
C view).  The first instruction converts the double-precision value in %f16
C to a 64-bit integer in %f14.  No negation of %g3 is visible in this stage.
C Register roles per the inline remarks: %g2 = p16, %g1 = p0, %g3 = cy,
C %g5 = rp[i], %g4 = working value p.
263 .
L4: fdtox
%f16
, %f14
C %g4 = p16 << 16, then reload %g2 from [%o5+0] for a later stage.
264 sllx
%g2
, 16, %g4 C
(p16
<< 16)
265 ldx
[%o5
+0], %g2 C p16
C Keep only the low 32 bits of cy, form p = p0 + (p16 << 16), reload %g1 from
C [%o5+8], and add cy.
267 srl
%g3
, 0, %g3 C zero most significant
32 bits
268 add %g1
, %g4
, %g4 C p
= p0
+ (p16
<< 16)
269 ldx
[%o5
+8], %g1 C p0
270 add %g3
, %g4
, %g4 C p
+= cy
C %g4 = %g5 - %g4, that is rp[i] minus (product + cy).  The inline remark
C reads "p += rp[i]", but the operation performed is a subtraction.
272 sub %g5
, %g4
, %g4 C p
+= rp
[i
]
C Bits 63..32 of the difference become the new cy; the next rp[] word is read
C from offset 8.
276 srlx
%g4
, 32, %g3 C new cy
277 lduw
[%o0
+8], %g5 C read rp
[i
]
C .L3: wind-down stage (the fused leading numbers skip, so some instructions
C may be missing from this view).  Register roles per the inline remarks:
C %g2 = p16, %g1 = p0, %g3 = cy, %g5 = rp[i], %g4 = working value p.
C %g4 = p16 << 16, then reload %g2 from [%o5+0] for a later stage.
280 .
L3: sllx
%g2
, 16, %g4 C
(p16
<< 16)
281 ldx
[%o5
+0], %g2 C p16
C Keep only the low 32 bits of cy, form p = p0 + (p16 << 16), reload %g1 from
C [%o5+8], and add cy.
282 srl
%g3
, 0, %g3 C zero most significant
32 bits
283 add %g1
, %g4
, %g4 C p
= p0
+ (p16
<< 16)
284 ldx
[%o5
+8], %g1 C p0
285 add %g3
, %g4
, %g4 C p
+= cy
C %g4 = %g5 - %g4, that is rp[i] minus (product + cy).  The inline remark
C reads "p += rp[i]", but the operation performed is a subtraction.
286 sub %g5
, %g4
, %g4 C p
+= rp
[i
]
C Bits 63..32 of the difference become the new cy; the next rp[] word is read
C from offset 12.
289 srlx
%g4
, 32, %g3 C new cy
290 lduw
[%o0
+12], %g5 C read rp
[i
]
C .L2: wind-down stage (the fused leading numbers skip, so some instructions
C may be missing from this view).  Register roles per the inline remarks:
C %g2 = p16, %g1 = p0, %g3 = cy, %g5 = rp[i], %g4 = working value p.
C %g4 = p16 << 16, then reload %g2 from [%o5+0] for the following stage.
293 .
L2: sllx
%g2
, 16, %g4 C
(p16
<< 16)
294 ldx
[%o5
+0], %g2 C p16
C Keep only the low 32 bits of cy, form p = p0 + (p16 << 16), reload %g1 from
C [%o5+8], and add cy.
295 srl
%g3
, 0, %g3 C zero most significant
32 bits
296 add %g1
, %g4
, %g4 C p
= p0
+ (p16
<< 16)
297 ldx
[%o5
+8], %g1 C p0
298 add %g3
, %g4
, %g4 C p
+= cy
C %g4 = %g5 - %g4, that is rp[i] minus (product + cy).  The inline remark
C reads "p += rp[i]", but the operation performed is a subtraction.
299 sub %g5
, %g4
, %g4 C p
+= rp
[i
]
C Bits 63..32 of the difference become the new cy; the next rp[] word is read
C from offset 16.
301 srlx
%g4
, 32, %g3 C new cy
302 lduw
[%o0
+16], %g5 C read rp
[i
]
C .L1: last visible stage before the end of the function.  Unlike the earlier
C stages, no reload of %g2/%g1 from the scratch area and no further rp[] load
C appear here.  The fused leading numbers skip (311 to 316), so the store of
C the result and the return sequence are not visible in this view.
C Register roles per the inline remarks: %g2 = p16, %g1 = p0, %g3 = cy,
C %g5 = rp[i], %g4 = working value p.
305 .
L1: sllx
%g2
, 16, %g4 C
(p16
<< 16)
C Keep only the low 32 bits of cy, form p = p0 + (p16 << 16), and add cy.
306 srl
%g3
, 0, %g3 C zero most significant
32 bits
307 add %g1
, %g4
, %g4 C p
= p0
+ (p16
<< 16)
308 add %g3
, %g4
, %g4 C p
+= cy
C %g4 = %g5 - %g4, that is rp[i] minus (product + cy).  The inline remark
C reads "p += rp[i]", but the operation performed is a subtraction.
309 sub %g5
, %g4
, %g4 C p
+= rp
[i
]
C Bits 63..32 of the difference become the final cy value in %g3.
311 srlx
%g4
, 32, %g3 C new cy
C Close the function opened by PROLOGUE(mpn_submul_1).
316 EPILOGUE
(mpn_submul_1
)