1 dnl SPARC v9
64-bit mpn_addmul_1
-- Multiply a limb vector with a limb
and add
2 dnl the result to a second limb vector.
4 dnl Copyright
1998, 2000-2004 Free Software Foundation
, Inc.
6 dnl
This file is part of the GNU MP Library.
8 dnl The GNU MP Library is free software
; you can redistribute it and/or modify
9 dnl it under the terms of
either:
11 dnl
* the GNU Lesser General
Public License as published by the Free
12 dnl Software Foundation
; either version 3 of the License, or (at your
13 dnl option
) any later version.
17 dnl
* the GNU General
Public License as published by the Free Software
18 dnl Foundation
; either version 2 of the License, or (at your option) any
21 dnl
or both
in parallel
, as here.
23 dnl The GNU MP Library is distributed
in the hope that it will be useful
, but
24 dnl WITHOUT ANY WARRANTY
; without even the implied warranty of MERCHANTABILITY
25 dnl
or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
Public License
28 dnl You should have received copies of the GNU General
Public License
and the
29 dnl GNU Lesser General
Public License along with the GNU MP Library. If
not,
30 dnl see
https://www.gnu.
org/licenses
/.
32 include(`..
/config.m4
')
38 C Algorithm: We use eight floating-point multiplies per limb product, with the
39 C invariant v operand split into four 16-bit pieces, and the up operand split
40 C into 32-bit pieces. We sum pairs of 48-bit partial products using
41 C floating-point add, then convert the four 49-bit product-sums and transfer
42 C them to the integer unit.
44 C Possible optimizations:
45 C 0. Rewrite to use algorithm of mpn_addmul_2.
46 C 1. Align the stack area where we transfer the four 49-bit product-sums
47 C to a 32-byte boundary. That would minimize the cache collision.
48 C (UltraSPARC-1/2 use a direct-mapped cache.) (Perhaps even better would
49 C be to align the area to map to the area immediately before up?)
50 C 2. Sum the four 49-bit quantities using 32-bit operations, as in the
51 C development version of mpn_addmul_2. This would save many integer instructions.
52 C 3. Unrolling. Questionable if it is worth the code expansion, given that
53 C it could only save 1 cycle/limb.
54 C 4. Specialize for particular v values. If its upper 32 bits are zero, we
55 C could save many operations, in the FPU (fmuld), but more so in the IEU
56 C since we'll be summing
48-bit quantities
, which might be simpler.
57 C 5. Ideally, we should schedule the f2/f3 and f4/f5 RAW dependencies further
58 C apart, and the i00,i16,i32,i48 RAW dependencies closer together. The distance
59 C for the latter should not be greater than needed for L2 cache latency, and
60 C also not so great that i16 needs to be copied.
61 C 6. Avoid performing mem+fa+fm in the same cycle, at least not when we want
62 C to get high IEU bandwidth. (12 of the 14 cycles will be free for 2 IEU ops.)
65 C Instruction classification
(as per UltraSPARC
-1/2 functional units
):
69 C
10 ISHIFT
+ 14 IADDLOG
71 C 55 insns in total (plus one mov insn that should be optimized out)
73 C The loop executes 56 instructions in 14 cycles on UltraSPARC-1/2, i.e., we
74 C sustain the peak execution rate of 4 instructions/cycle.
C Register declarations and m4 register aliases for this file.
C   REGISTER(...,#scratch) declares %g2 and %g3 as scratch registers.
C   p00..p48, r32..r80, v00..v48, u00/u32 and a00..a48 alias the even-numbered
C   fp registers %f8..%f42.  In the function below, v00..v48 are loaded (ldd)
C   from the stack scratch area, r64/r80 receive fmuld products, and a00..a48
C   are stored (std) to the scratch area for transfer to the integer unit.
C   i00..i48 alias %l0..%l3 and are reloaded (ldx) from that scratch area.
C   xffffffff aliases %l7 and holds a mask (%g4 shifted right by 32).
C   NOTE(review): xffff, cy and rlimb are used by the function but their
C   definitions are not visible in this excerpt -- confirm they are defined.
83 REGISTER
(%g2
,#scratch
)
84 REGISTER
(%g3
,#scratch
)
86 define
(`p00
', `%f8') define
(`p16
',`%f10') define
(`p32
',`%f12') define
(`p48
',`%f14')
87 define
(`r32
',`%f16') define
(`r48
',`%f18') define
(`r64
',`%f20') define
(`r80
',`%f22')
88 define
(`v00
',`%f24') define
(`v16
',`%f26') define
(`v32
',`%f28') define
(`v48
',`%f30')
89 define
(`u00
',`%f32') define
(`u32
', `%f34')
90 define
(`a00
',`%f36') define
(`a16
',`%f38') define
(`a32
',`%f40') define
(`a48
',`%f42')
93 define
(`i00
',`%l0') define
(`i16
',`%l1') define
(`i32
',`%l2') define
(`i48
',`%l3')
94 define
(`xffffffff
',`%l7')
C mpn_addmul_1 entry: set-up and first feed-in stage.
C   - xffff and xffffffff receive %g4 shifted right by 48 and by 32 bits (masks).
C   - %g2 / %g3 are stored to the 32-byte scratch area at [%sp+2223+0..24] and
C     the four slots are reloaded with ldd as the doubles v00, v16, v32, v48.
C   - %f2 and %f4 are loaded from the first word of scratch slot 0 (zeroing them,
C     per the existing comments); %f3 / %f5 get the low and high 32-bit words of
C     the first up limb.
C   - bnz branches to .L_two_or_more; otherwise execution falls through.
C   NOTE(review): the instructions that set %g4, %g2, %g3, %i5, %i2 and the
C   condition codes tested by bnz are not visible in this excerpt.  2223 is
C   presumably the V9 stack bias (2047) plus 176 -- confirm.
97 PROLOGUE
(mpn_addmul_1
)
99 C Initialization.
(1) Split v operand
into four
16-bit chunks
and store them
100 C as IEEE double
in fp registers.
(2) Clear upper
32 bits of fp register pairs
101 C f2
and f4.
(3) Store masks
in registers aliased to `xffff
' and `xffffffff'.
105 srlx
%g4
, 48, xffff C store
mask in register `xffff
'
C Spill integer values to the four scratch slots so they can be reloaded into
C fp registers (presumably %g2/%g3 are recomputed between the stores -- the
C intervening instructions are not visible here).
107 stx %g2, [%sp+2223+0]
110 stx %g3, [%sp+2223+8]
113 stx %g2, [%sp+2223+16]
115 stx %g3, [%sp+2223+24]
116 srlx %g4, 32, xffffffff C store mask in register `xffffffff'
C Reload the four scratch slots as 64-bit fp values v00..v48.
127 ldd
[%sp+2223+0], v00
128 ldd
[%sp+2223+8], v16
129 ldd
[%sp+2223+16], v32
130 ldd
[%sp+2223+24], v48
131 ld
[%sp+2223+0],%f2 C zero f2
132 ld
[%sp+2223+0],%f4 C zero f4
133 ld
[%i5
+%i2
], %f3 C read
low 32 bits of up
[i
]
134 ld
[%i1
+%i2
], %f5 C read
high 32 bits of up
[i
]
140 C Start real work.
(We sneakingly read f3
and f5 above...
)
141 C The software pipeline is very deep
, requiring
4 feed
-in stages.
151 bnz
,pt
%xcc
, .L_two_or_more
C Fall-through path when the bnz to .L_two_or_more is not taken
C (presumably the single-limb case -- confirm against the counter set-up).
C Issues the u32*v32 and u32*v48 multiplies, stores a00..a48 to the scratch
C area, loads rp[i] into rlimb, reloads the scratch slots as integers i00..i48,
C and performs the first integer stage:
C   %g5 = i00 + LO(rlimb), %g4 = i32 + HI(rlimb),
C   hi64 in %o1 = (i16 >> 48) + (i48 >> 16) + (%g4 >> 32),
C   mi64 in %o2 = %g2 + (%g4 << 16) + (i48 << 32) - (hi64 << 48),
C   x in %o4 = cy + %g5.
C NOTE(review): %g2 is read by the mi64 computation but the instruction that
C writes it is not visible in this excerpt.
155 fmuld u32
, v32
, r64 C FIXME
not urgent
159 fmuld u32
, v48
, r80 C FIXME
not urgent
163 std a00
, [%sp+2223+0]
164 std a16
, [%sp+2223+8]
165 std a32
, [%sp+2223+16]
166 std a48
, [%sp+2223+24]
170 ldx
[%i0
+%i2
], rlimb C read rp
[i
]
172 ldx
[%sp+2223+0], i00
173 ldx
[%sp+2223+8], i16
174 ldx
[%sp+2223+16], i32
175 ldx
[%sp+2223+24], i48
176 std a00
, [%sp+2223+0]
177 std a16
, [%sp+2223+8]
180 srlx rlimb
, 32, %g4 C HI
(rlimb
)
181 and rlimb
, xffffffff
, %g5 C LO
(rlimb
)
182 add i00
, %g5
, %g5 C i00
+ now
in g5
183 ldx
[%sp+2223+0], i00
184 srlx i16
, 48, %l4 C
(i16
>> 48)
186 ldx
[%sp+2223+8], i16
187 srlx i48
, 16, %l5 C
(i48
>> 16)
188 add i32
, %g4
, %g4 C i32
+ now
in g4
189 sllx i48
, 32, %l6 C
(i48
<< 32)
190 srlx
%g4
, 32, %o3 C
(i32
>> 32)
191 add %l5
, %l4
, %o1 C hi64
- in %o1
192 std a00
, [%sp+2223+0]
193 sllx
%g4
, 16, %o2 C
(i32
<< 16)
194 add %o3
, %o1
, %o1 C hi64
in %o1
1st ASSIGNMENT
195 std a16
, [%sp+2223+8]
196 sllx
%o1
, 48, %o3 C
(hi64
<< 48)
197 add %g2
, %o2
, %o2 C mi64
- in %o2
198 add %l6
, %o2
, %o2 C mi64
- in %o2
199 sub %o2
, %o3
, %o2 C mi64
in %o2
1st ASSIGNMENT
200 add cy
, %g5
, %o4 C x
= prev
(i00
) + cy
C Second feed-in stage (presumably the target of .L_two_or_more; the label
C itself is not visible in this excerpt).  Reads the two 32-bit words of the
C next up limb into %f3 / %f5, issues the u32*v32 and u32*v48 multiplies,
C stores a00..a48 to the scratch area, and branches to .L_three_or_more when
C the tested condition is non-zero.
C NOTE(review): the instruction that sets the condition codes is not visible.
205 ld
[%i5
+%i2
], %f3 C read
low 32 bits of up
[i
]
206 fmuld u32
, v32
, r64 C FIXME
not urgent
208 ld
[%i1
+%i2
], %f5 C read
high 32 bits of up
[i
]
211 fmuld u32
, v48
, r80 C FIXME
not urgent
217 std a00
, [%sp+2223+0]
219 std a16
, [%sp+2223+8]
221 std a32
, [%sp+2223+16]
223 std a48
, [%sp+2223+24]
229 bnz
,pt
%xcc
, .L_three_or_more
C Fall-through path when the bnz to .L_three_or_more is not taken.
C Finishes the multiplies for the last fetched up limb, loads rp[i] into rlimb,
C moves one set of product sums through the scratch area (ldx i00..i48, then
C std a00..a48 for the following set), and runs the first integer stage
C (hi64 in %o1, mi64 in %o2, x in %o4) while already reloading i00..i48 and
C rlimb for the next limb.
C NOTE(review): %g2 is read by the mi64 computation but its write is not
C visible in this excerpt.
233 fmuld u32
, v32
, r64 C FIXME
not urgent
236 ldx
[%i0
+%i2
], rlimb C read rp
[i
]
238 fmuld u32
, v48
, r80 C FIXME
not urgent
240 ldx
[%sp+2223+0], i00
242 ldx
[%sp+2223+8], i16
243 ldx
[%sp+2223+16], i32
244 ldx
[%sp+2223+24], i48
246 std a00
, [%sp+2223+0]
247 std a16
, [%sp+2223+8]
248 std a32
, [%sp+2223+16]
249 std a48
, [%sp+2223+24]
253 srlx rlimb
, 32, %g4 C HI
(rlimb
)
254 and rlimb
, xffffffff
, %g5 C LO
(rlimb
)
255 ldx
[%i0
+%i2
], rlimb C read rp
[i
]
256 add i00
, %g5
, %g5 C i00
+ now
in g5
258 ldx
[%sp+2223+0], i00
259 srlx i16
, 48, %l4 C
(i16
>> 48)
261 ldx
[%sp+2223+8], i16
262 srlx i48
, 16, %l5 C
(i48
>> 16)
263 add i32
, %g4
, %g4 C i32
+ now
in g4
264 ldx
[%sp+2223+16], i32
265 sllx i48
, 32, %l6 C
(i48
<< 32)
266 ldx
[%sp+2223+24], i48
267 srlx
%g4
, 32, %o3 C
(i32
>> 32)
268 add %l5
, %l4
, %o1 C hi64
- in %o1
269 std a00
, [%sp+2223+0]
270 sllx
%g4
, 16, %o2 C
(i32
<< 16)
271 add %o3
, %o1
, %o1 C hi64
in %o1
1st ASSIGNMENT
272 std a16
, [%sp+2223+8]
273 sllx
%o1
, 48, %o3 C
(hi64
<< 48)
274 add %g2
, %o2
, %o2 C mi64
- in %o2
275 add %l6
, %o2
, %o2 C mi64
- in %o2
276 sub %o2
, %o3
, %o2 C mi64
in %o2
1st ASSIGNMENT
277 add cy
, %g5
, %o4 C x
= prev
(i00
) + cy
C Third feed-in stage (presumably the target of .L_three_or_more; the label
C itself is not visible in this excerpt).  Reads the next up limb words into
C %f3 / %f5, loads rp[i] into rlimb, reloads i00..i48 from the scratch area,
C stores the next a00..a48 there, and branches to .L_four_or_more when the
C tested condition is non-zero.
C NOTE(review): the instruction that sets the condition codes is not visible.
282 ld
[%i5
+%i2
], %f3 C read
low 32 bits of up
[i
]
283 fmuld u32
, v32
, r64 C FIXME
not urgent
285 ld
[%i1
+%i2
], %f5 C read
high 32 bits of up
[i
]
287 ldx
[%i0
+%i2
], rlimb C read rp
[i
]
289 fmuld u32
, v48
, r80 C FIXME
not urgent
291 ldx
[%sp+2223+0], i00
293 ldx
[%sp+2223+8], i16
295 ldx
[%sp+2223+16], i32
297 ldx
[%sp+2223+24], i48
299 std a00
, [%sp+2223+0]
301 std a16
, [%sp+2223+8]
303 std a32
, [%sp+2223+16]
305 std a48
, [%sp+2223+24]
311 bnz
,pt
%xcc
, .L_four_or_more
C Fall-through path when the bnz to .L_four_or_more is not taken.
C Issues the remaining multiplies, splits rlimb into HI (%g4) and LO (%g5),
C loads the next rp[i], and runs the first integer stage (hi64 in %o1, mi64 in
C %o2, x in %o4) interleaved with reloading i00..i48 from, and storing
C a00..a48 to, the scratch area.
C NOTE(review): %g2 is read by the mi64 computation but its write is not
C visible in this excerpt.
315 fmuld u32
, v32
, r64 C FIXME
not urgent
318 srlx rlimb
, 32, %g4 C HI
(rlimb
)
319 and rlimb
, xffffffff
, %g5 C LO
(rlimb
)
320 ldx
[%i0
+%i2
], rlimb C read rp
[i
]
322 add i00
, %g5
, %g5 C i00
+ now
in g5
323 fmuld u32
, v48
, r80 C FIXME
not urgent
325 ldx
[%sp+2223+0], i00
327 srlx i16
, 48, %l4 C
(i16
>> 48)
329 ldx
[%sp+2223+8], i16
330 srlx i48
, 16, %l5 C
(i48
>> 16)
331 add i32
, %g4
, %g4 C i32
+ now
in g4
332 ldx
[%sp+2223+16], i32
333 sllx i48
, 32, %l6 C
(i48
<< 32)
334 ldx
[%sp+2223+24], i48
336 srlx
%g4
, 32, %o3 C
(i32
>> 32)
337 add %l5
, %l4
, %o1 C hi64
- in %o1
338 std a00
, [%sp+2223+0]
339 sllx
%g4
, 16, %o2 C
(i32
<< 16)
340 add %o3
, %o1
, %o1 C hi64
in %o1
1st ASSIGNMENT
341 std a16
, [%sp+2223+8]
342 sllx
%o1
, 48, %o3 C
(hi64
<< 48)
343 add %g2
, %o2
, %o2 C mi64
- in %o2
344 std a32
, [%sp+2223+16]
345 add %l6
, %o2
, %o2 C mi64
- in %o2
346 std a48
, [%sp+2223+24]
347 sub %o2
, %o3
, %o2 C mi64
in %o2
1st ASSIGNMENT
348 add cy
, %g5
, %o4 C x
= prev
(i00
) + cy
C Fourth feed-in stage (presumably the target of .L_four_or_more; the label
C itself is not visible in this excerpt).  Same work as the previous stage plus
C fetching the next up limb words into %f3 / %f5: split rlimb, load the next
C rp[i], run the first integer stage (hi64 in %o1, mi64 in %o2, x in %o4), and
C cycle i00..i48 / a00..a48 through the scratch area.
C NOTE(review): %g2 is read by the mi64 computation but its write is not
C visible in this excerpt; any branch ending this stage is also not visible.
353 ld
[%i5
+%i2
], %f3 C read
low 32 bits of up
[i
]
354 fmuld u32
, v32
, r64 C FIXME
not urgent
356 ld
[%i1
+%i2
], %f5 C read
high 32 bits of up
[i
]
358 srlx rlimb
, 32, %g4 C HI
(rlimb
)
359 and rlimb
, xffffffff
, %g5 C LO
(rlimb
)
360 ldx
[%i0
+%i2
], rlimb C read rp
[i
]
362 add i00
, %g5
, %g5 C i00
+ now
in g5
363 fmuld u32
, v48
, r80 C FIXME
not urgent
365 ldx
[%sp+2223+0], i00
367 srlx i16
, 48, %l4 C
(i16
>> 48)
369 ldx
[%sp+2223+8], i16
371 srlx i48
, 16, %l5 C
(i48
>> 16)
372 add i32
, %g4
, %g4 C i32
+ now
in g4
373 ldx
[%sp+2223+16], i32
375 sllx i48
, 32, %l6 C
(i48
<< 32)
376 ldx
[%sp+2223+24], i48
378 srlx
%g4
, 32, %o3 C
(i32
>> 32)
379 add %l5
, %l4
, %o1 C hi64
- in %o1
380 std a00
, [%sp+2223+0]
382 sllx
%g4
, 16, %o2 C
(i32
<< 16)
383 add %o3
, %o1
, %o1 C hi64
in %o1
1st ASSIGNMENT
384 std a16
, [%sp+2223+8]
386 sllx
%o1
, 48, %o3 C
(hi64
<< 48)
387 add %g2
, %o2
, %o2 C mi64
- in %o2
388 std a32
, [%sp+2223+16]
390 add %l6
, %o2
, %o2 C mi64
- in %o2
391 std a48
, [%sp+2223+24]
394 sub %o2
, %o3
, %o2 C mi64
in %o2
1st ASSIGNMENT
397 add cy
, %g5
, %o4 C x
= prev
(i00
) + cy
C Steady-state stage (presumably the main loop body; the loop label and the
C back branch are not visible in this excerpt).  All pipeline activities are
C present at once:
C   - second integer stage for the oldest limb: mi64 += x >> 16,
C     %o5 = x & 0xffff, %i3 = mi64 << 16, new cy = (mi64 >> 48) + hi64;
C   - first integer stage for the next limb (hi64 in %o1, mi64 in %o2, x in %o4);
C   - loads of the next up limb words (%f3 / %f5) and of rp[i] (rlimb);
C   - ldx i00..i48 from and std a00..a48 to the scratch area.
C NOTE(review): the second stage reads the previous hi64 from %o1 (at
C "add %o7, %o1, cy") before %o1 is overwritten further down; keep this order.
409 srlx
%o4
, 16, %o5 C
(x
>> 16)
410 ld
[%i5
+%i2
], %f3 C read
low 32 bits of up
[i
]
411 fmuld u32
, v32
, r64 C FIXME
not urgent
414 add %o5
, %o2
, %o2 C mi64
in %o2
2nd ASSIGNMENT
415 and %o4
, xffff
, %o5 C
(x
& 0xffff)
416 ld
[%i1
+%i2
], %f5 C read
high 32 bits of up
[i
]
419 srlx rlimb
, 32, %g4 C HI
(rlimb
)
420 and rlimb
, xffffffff
, %g5 C LO
(rlimb
)
421 ldx
[%i0
+%i2
], rlimb C read rp
[i
]
424 srlx
%o2
, 48, %o7 C
(mi64
>> 48)
425 add i00
, %g5
, %g5 C i00
+ now
in g5
426 fmuld u32
, v48
, r80 C FIXME
not urgent
429 sllx
%o2
, 16, %i3 C
(mi64
<< 16)
430 add %o7
, %o1
, cy C new cy
431 ldx
[%sp+2223+0], i00
434 srlx i16
, 48, %l4 C
(i16
>> 48)
436 ldx
[%sp+2223+8], i16
439 srlx i48
, 16, %l5 C
(i48
>> 16)
440 add i32
, %g4
, %g4 C i32
+ now
in g4
441 ldx
[%sp+2223+16], i32
444 sllx i48
, 32, %l6 C
(i48
<< 32)
446 ldx
[%sp+2223+24], i48
449 srlx
%g4
, 32, %o3 C
(i32
>> 32)
450 add %l5
, %l4
, %o1 C hi64
- in %o1
451 std a00
, [%sp+2223+0]
454 sllx
%g4
, 16, %o2 C
(i32
<< 16)
455 add %o3
, %o1
, %o1 C hi64
in %o1
1st ASSIGNMENT
456 std a16
, [%sp+2223+8]
459 sllx
%o1
, 48, %o3 C
(hi64
<< 48)
460 add %g2
, %o2
, %o2 C mi64
- in %o2
461 std a32
, [%sp+2223+16]
464 add %l6
, %o2
, %o2 C mi64
- in %o2
465 std a48
, [%sp+2223+24]
469 sub %o2
, %o3
, %o2 C mi64
in %o2
1st ASSIGNMENT
474 add cy
, %g5
, %o4 C x
= prev
(i00
) + cy
C Wind-down stage: identical to the steady-state stage except that no further
C up limb words are loaded into %f3 / %f5.  Still issues the u32*v32 and
C u32*v48 multiplies, loads rp[i], cycles i00..i48 / a00..a48 through the
C scratch area, and runs both integer stages (new cy, then hi64 / mi64 / x).
C NOTE(review): the label that starts this stage is not visible in this
C excerpt; %g2 is read by the mi64 computation but its write is not visible.
481 srlx
%o4
, 16, %o5 C
(x
>> 16)
482 fmuld u32
, v32
, r64 C FIXME
not urgent
484 add %o5
, %o2
, %o2 C mi64
in %o2
2nd ASSIGNMENT
485 and %o4
, xffff
, %o5 C
(x
& 0xffff)
487 srlx rlimb
, 32, %g4 C HI
(rlimb
)
488 and rlimb
, xffffffff
, %g5 C LO
(rlimb
)
489 ldx
[%i0
+%i2
], rlimb C read rp
[i
]
491 srlx
%o2
, 48, %o7 C
(mi64
>> 48)
492 add i00
, %g5
, %g5 C i00
+ now
in g5
493 fmuld u32
, v48
, r80 C FIXME
not urgent
495 sllx
%o2
, 16, %i3 C
(mi64
<< 16)
496 add %o7
, %o1
, cy C new cy
497 ldx
[%sp+2223+0], i00
499 srlx i16
, 48, %l4 C
(i16
>> 48)
501 ldx
[%sp+2223+8], i16
502 srlx i48
, 16, %l5 C
(i48
>> 16)
503 add i32
, %g4
, %g4 C i32
+ now
in g4
504 ldx
[%sp+2223+16], i32
505 sllx i48
, 32, %l6 C
(i48
<< 32)
507 ldx
[%sp+2223+24], i48
509 srlx
%g4
, 32, %o3 C
(i32
>> 32)
510 add %l5
, %l4
, %o1 C hi64
- in %o1
511 std a00
, [%sp+2223+0]
512 sllx
%g4
, 16, %o2 C
(i32
<< 16)
513 add %o3
, %o1
, %o1 C hi64
in %o1
1st ASSIGNMENT
514 std a16
, [%sp+2223+8]
515 sllx
%o1
, 48, %o3 C
(hi64
<< 48)
516 add %g2
, %o2
, %o2 C mi64
- in %o2
517 std a32
, [%sp+2223+16]
518 add %l6
, %o2
, %o2 C mi64
- in %o2
519 std a48
, [%sp+2223+24]
520 sub %o2
, %o3
, %o2 C mi64
in %o2
1st ASSIGNMENT
522 add cy
, %g5
, %o4 C x
= prev
(i00
) + cy
C Wind-down stage with no fp multiplies left: runs the second integer stage
C for the previous limb (mi64 += x >> 16, %o5 = x & 0xffff, %i3 = mi64 << 16,
C new cy), loads the next rp[i], reloads i00..i48 from the scratch area, stores
C only a00 and a16 there, and runs the first integer stage (hi64 in %o1, mi64
C in %o2, x in %o4).
C NOTE(review): the label that starts this stage is not visible in this
C excerpt; %g2 is read by the mi64 computation but its write is not visible.
525 srlx
%o4
, 16, %o5 C
(x
>> 16)
526 add %o5
, %o2
, %o2 C mi64
in %o2
2nd ASSIGNMENT
527 and %o4
, xffff
, %o5 C
(x
& 0xffff)
529 srlx rlimb
, 32, %g4 C HI
(rlimb
)
530 and rlimb
, xffffffff
, %g5 C LO
(rlimb
)
531 ldx
[%i0
+%i2
], rlimb C read rp
[i
]
532 srlx
%o2
, 48, %o7 C
(mi64
>> 48)
533 add i00
, %g5
, %g5 C i00
+ now
in g5
535 sllx
%o2
, 16, %i3 C
(mi64
<< 16)
536 add %o7
, %o1
, cy C new cy
537 ldx
[%sp+2223+0], i00
538 srlx i16
, 48, %l4 C
(i16
>> 48)
540 ldx
[%sp+2223+8], i16
541 srlx i48
, 16, %l5 C
(i48
>> 16)
542 add i32
, %g4
, %g4 C i32
+ now
in g4
543 ldx
[%sp+2223+16], i32
544 sllx i48
, 32, %l6 C
(i48
<< 32)
546 ldx
[%sp+2223+24], i48
547 srlx
%g4
, 32, %o3 C
(i32
>> 32)
548 add %l5
, %l4
, %o1 C hi64
- in %o1
549 std a00
, [%sp+2223+0]
550 sllx
%g4
, 16, %o2 C
(i32
<< 16)
551 add %o3
, %o1
, %o1 C hi64
in %o1
1st ASSIGNMENT
552 std a16
, [%sp+2223+8]
553 sllx
%o1
, 48, %o3 C
(hi64
<< 48)
554 add %g2
, %o2
, %o2 C mi64
- in %o2
555 add %l6
, %o2
, %o2 C mi64
- in %o2
556 sub %o2
, %o3
, %o2 C mi64
in %o2
1st ASSIGNMENT
558 add cy
, %g5
, %o4 C x
= prev
(i00
) + cy
C Wind-down stage with no further rp loads or scratch stores: runs the second
C integer stage for the previous limb (new cy), reloads only i00 and i16 from
C the scratch area, and runs the first integer stage for the last limb
C (hi64 in %o1, mi64 in %o2, x in %o4).
C NOTE(review): the label that starts this stage is not visible in this
C excerpt; %g2 is read by the mi64 computation but its write is not visible.
561 srlx
%o4
, 16, %o5 C
(x
>> 16)
562 add %o5
, %o2
, %o2 C mi64
in %o2
2nd ASSIGNMENT
563 and %o4
, xffff
, %o5 C
(x
& 0xffff)
564 srlx rlimb
, 32, %g4 C HI
(rlimb
)
565 and rlimb
, xffffffff
, %g5 C LO
(rlimb
)
566 srlx
%o2
, 48, %o7 C
(mi64
>> 48)
567 add i00
, %g5
, %g5 C i00
+ now
in g5
568 sllx
%o2
, 16, %i3 C
(mi64
<< 16)
569 add %o7
, %o1
, cy C new cy
570 ldx
[%sp+2223+0], i00
571 srlx i16
, 48, %l4 C
(i16
>> 48)
573 ldx
[%sp+2223+8], i16
574 srlx i48
, 16, %l5 C
(i48
>> 16)
575 add i32
, %g4
, %g4 C i32
+ now
in g4
576 sllx i48
, 32, %l6 C
(i48
<< 32)
578 srlx
%g4
, 32, %o3 C
(i32
>> 32)
579 add %l5
, %l4
, %o1 C hi64
- in %o1
580 sllx
%g4
, 16, %o2 C
(i32
<< 16)
581 add %o3
, %o1
, %o1 C hi64
in %o1
1st ASSIGNMENT
582 sllx
%o1
, 48, %o3 C
(hi64
<< 48)
583 add %g2
, %o2
, %o2 C mi64
- in %o2
584 add %l6
, %o2
, %o2 C mi64
- in %o2
585 sub %o2
, %o3
, %o2 C mi64
in %o2
1st ASSIGNMENT
587 add cy
, %g5
, %o4 C x
= prev
(i00
) + cy
C Final stage: second integer stage for the last limb only.
C   mi64 += x >> 16, %o5 = x & 0xffff, %i3 = mi64 << 16,
C   cy = (mi64 >> 48) + hi64 (%o1).
C NOTE(review): the instructions that combine %i3 / %o5 into the result limb,
C store it, and return cy are not visible in this excerpt -- confirm against
C the complete file before relying on this description.
590 srlx
%o4
, 16, %o5 C
(x
>> 16)
591 add %o5
, %o2
, %o2 C mi64
in %o2
2nd ASSIGNMENT
592 and %o4
, xffff
, %o5 C
(x
& 0xffff)
593 srlx
%o2
, 48, %o7 C
(mi64
>> 48)
594 sllx
%o2
, 16, %i3 C
(mi64
<< 16)
595 add %o7
, %o1
, cy C new cy
606 EPILOGUE
(mpn_addmul_1
)