dnl  SPARC v9 64-bit mpn_mul_1 -- Multiply a limb vector with a limb and store
dnl  the result in a second limb vector.

dnl  Copyright 1998, 2000-2003 Free Software Foundation, Inc.
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.
dnl  Pull in config.m4 from the parent directory.  The quoted file name must
dnl  stay on a single line: m4 takes the quoted text literally, so an embedded
dnl  newline would become part of the path.
include(`../config.m4')
C Algorithm: We use eight floating-point multiplies per limb product, with the
C invariant v operand split into four 16-bit pieces, and the s1 operand split
C into 32-bit pieces.  We sum pairs of 48-bit partial products using
C floating-point add, then convert the four 49-bit product-sums and transfer
C them to the integer unit.

C Possible optimizations:
C   1. Align the stack area where we transfer the four 49-bit product-sums
C      to a 32-byte boundary.  That would minimize the cache collision.
C      (UltraSPARC-1/2 use a direct-mapped cache.)  (Perhaps even better would
C      be to align the area to map to the area immediately before s1?)
C   2. Sum the 4 49-bit quantities using 32-bit operations, as in the
C      development mpn_addmul_2.  This would save many integer instructions.
C   3. Unrolling.  Questionable if it is worth the code expansion, given that
C      it could only save 1 cycle/limb.
C   4. Specialize for particular v values.  If its upper 32 bits are zero, we
C      could save many operations, in the FPU (fmuld), but more so in the IEU
C      since we'll be summing 48-bit quantities, which might be simpler.
C   5. Ideally, we should schedule the f2/f3 and f4/f5 RAW further apart, and
C      the i00,i16,i32,i48 RAW less apart.  The latter apart-scheduling should
C      not be greater than needed for L2 cache latency, and also not so great
C      that i16 needs to be copied.
C   6. Avoid performing mem+fa+fm in the same cycle, at least not when we want
C      to get high IEU bandwidth.  (12 of the 14 cycles will be free for 2 IEU
C      ops.)
C Instruction classification (as per UltraSPARC-1/2 functional units):
C    9 ISHIFT + 10? IADDLOG
C   49 insns in total (plus three mov insns that should be optimized out)

C The loop executes 53 instructions in 14 cycles on UltraSPARC-1/2, i.e., we
C sustain 3.79 instructions/cycle.
C Declare %g2 and %g3 as scratch registers.  REGISTER is not defined in this
C file (presumably it comes from config.m4 and emits the assembler register
C declaration -- confirm there).
REGISTER(%g2,#scratch)
REGISTER(%g3,#scratch)

C Symbolic names for the registers used by the multiply pipeline.  Each macro
C call must have its opening parenthesis directly after the macro name and
C its quoted name on one line, otherwise m4 does not define the alias.

C Floating-point temporaries.  r64 and r80 receive the fmuld results of
C u32*v32 and u32*v48; p00..p48 and r32/r48 are presumably the remaining
C partial products (their producers are not visible in this span).
define(`p00', `%f8')
define(`p16', `%f10')
define(`p32', `%f12')
define(`p48', `%f14')
define(`r32', `%f16')
define(`r48', `%f18')
define(`r64', `%f20')
define(`r80', `%f22')

C The four pieces of the invariant v operand, loaded with ldd from the stack
C scratch area during initialization.
define(`v00', `%f24')
define(`v16', `%f26')
define(`v32', `%f28')
define(`v48', `%f30')

C Floating-point forms of the current s1 limb pieces; u32 is the fmuld
C operand paired with v32 and v48 (u00 presumably pairs with the low pieces).
define(`u00', `%f32')
define(`u32', `%f34')

C Values written to the stack scratch area with std so the integer unit can
C read them back.
define(`a00', `%f36')
define(`a16', `%f38')
define(`a32', `%f40')
define(`a48', `%f42')

C Integer registers that receive the scratch-area slots via ldx.
define(`i00', `%l0')
define(`i16', `%l1')
define(`i32', `%l2')
define(`i48', `%l3')

C Mask register, initialized with srlx %g4, 32.
define(`xffffffff', `%l7')
C Initialization.  (1) Split v operand into four 16-bit chunks and store them
C as IEEE double in fp registers.  (2) Clear upper 32 bits of fp register pairs
C f2 and f4.  (3) Store masks in registers aliased to `xffff' and `xffffffff'.
104 srlx
%g4
, 48, xffff C store
mask in register `xffff
'
106 stx %g2, [%sp+2223+0]
109 stx %g3, [%sp+2223+8]
112 stx %g2, [%sp+2223+16]
114 stx %g3, [%sp+2223+24]
115 srlx %g4, 32, xffffffff C store mask in register `xffffffff'
126 ldd
[%sp+2223+0], v00
127 ldd
[%sp+2223+8], v16
128 ldd
[%sp+2223+16], v32
129 ldd
[%sp+2223+24], v48
130 ld
[%sp+2223+0],%f2 C zero f2
131 ld
[%sp+2223+0],%f4 C zero f4
132 ld
[%i5
+%i2
], %f3 C read
low 32 bits of up
[i
]
133 ld
[%i1
+%i2
], %f5 C read
high 32 bits of up
[i
]
C Start real work.  (We sneakily read f3 and f5 above...)
C The software pipeline is very deep, requiring 4 feed-in stages.
150 bnz
,pt
%xcc
, .L_two_or_more
154 fmuld u32
, v32
, r64 C FIXME
not urgent
158 fmuld u32
, v48
, r80 C FIXME
not urgent
162 std a00
, [%sp+2223+0]
163 std a16
, [%sp+2223+8]
164 std a32
, [%sp+2223+16]
165 std a48
, [%sp+2223+24]
170 ldx
[%sp+2223+0], i00
171 ldx
[%sp+2223+8], i16
172 ldx
[%sp+2223+16], i32
173 ldx
[%sp+2223+24], i48
174 std a00
, [%sp+2223+0]
175 std a16
, [%sp+2223+8]
178 mov i00
, %g5 C i00
+ now
in g5
179 ldx
[%sp+2223+0], i00
180 srlx i16
, 48, %l4 C
(i16
>> 48)
182 ldx
[%sp+2223+8], i16
183 srlx i48
, 16, %l5 C
(i48
>> 16)
184 mov i32
, %g4 C i32
+ now
in g4
185 sllx i48
, 32, %l6 C
(i48
<< 32)
186 srlx
%g4
, 32, %o3 C
(i32
>> 32)
187 add %l5
, %l4
, %o1 C hi64
- in %o1
188 std a00
, [%sp+2223+0]
189 sllx
%g4
, 16, %o2 C
(i32
<< 16)
190 add %o3
, %o1
, %o1 C hi64
in %o1
1st ASSIGNMENT
191 std a16
, [%sp+2223+8]
192 sllx
%o1
, 48, %o3 C
(hi64
<< 48)
193 add %g2
, %o2
, %o2 C mi64
- in %o2
194 add %l6
, %o2
, %o2 C mi64
- in %o2
195 sub %o2
, %o3
, %o2 C mi64
in %o2
1st ASSIGNMENT
196 add cy
, %g5
, %o4 C x
= prev
(i00
) + cy
201 ld
[%i5
+%i2
], %f3 C read
low 32 bits of up
[i
]
202 fmuld u32
, v32
, r64 C FIXME
not urgent
204 ld
[%i1
+%i2
], %f5 C read
high 32 bits of up
[i
]
207 fmuld u32
, v48
, r80 C FIXME
not urgent
213 std a00
, [%sp+2223+0]
215 std a16
, [%sp+2223+8]
217 std a32
, [%sp+2223+16]
219 std a48
, [%sp+2223+24]
225 bnz
,pt
%xcc
, .L_three_or_more
229 fmuld u32
, v32
, r64 C FIXME
not urgent
233 fmuld u32
, v48
, r80 C FIXME
not urgent
235 ldx
[%sp+2223+0], i00
237 ldx
[%sp+2223+8], i16
238 ldx
[%sp+2223+16], i32
239 ldx
[%sp+2223+24], i48
241 std a00
, [%sp+2223+0]
242 std a16
, [%sp+2223+8]
243 std a32
, [%sp+2223+16]
244 std a48
, [%sp+2223+24]
248 mov i00
, %g5 C i00
+ now
in g5
250 ldx
[%sp+2223+0], i00
251 srlx i16
, 48, %l4 C
(i16
>> 48)
253 ldx
[%sp+2223+8], i16
254 srlx i48
, 16, %l5 C
(i48
>> 16)
255 mov i32
, %g4 C i32
+ now
in g4
256 ldx
[%sp+2223+16], i32
257 sllx i48
, 32, %l6 C
(i48
<< 32)
258 ldx
[%sp+2223+24], i48
259 srlx
%g4
, 32, %o3 C
(i32
>> 32)
260 add %l5
, %l4
, %o1 C hi64
- in %o1
261 std a00
, [%sp+2223+0]
262 sllx
%g4
, 16, %o2 C
(i32
<< 16)
263 add %o3
, %o1
, %o1 C hi64
in %o1
1st ASSIGNMENT
264 std a16
, [%sp+2223+8]
265 sllx
%o1
, 48, %o3 C
(hi64
<< 48)
266 add %g2
, %o2
, %o2 C mi64
- in %o2
267 add %l6
, %o2
, %o2 C mi64
- in %o2
268 sub %o2
, %o3
, %o2 C mi64
in %o2
1st ASSIGNMENT
269 add cy
, %g5
, %o4 C x
= prev
(i00
) + cy
274 ld
[%i5
+%i2
], %f3 C read
low 32 bits of up
[i
]
275 fmuld u32
, v32
, r64 C FIXME
not urgent
277 ld
[%i1
+%i2
], %f5 C read
high 32 bits of up
[i
]
280 fmuld u32
, v48
, r80 C FIXME
not urgent
282 ldx
[%sp+2223+0], i00
284 ldx
[%sp+2223+8], i16
286 ldx
[%sp+2223+16], i32
288 ldx
[%sp+2223+24], i48
290 std a00
, [%sp+2223+0]
292 std a16
, [%sp+2223+8]
294 std a32
, [%sp+2223+16]
296 std a48
, [%sp+2223+24]
302 bnz
,pt
%xcc
, .L_four_or_more
306 fmuld u32
, v32
, r64 C FIXME
not urgent
310 mov i00
, %g5 C i00
+ now
in g5
311 fmuld u32
, v48
, r80 C FIXME
not urgent
313 ldx
[%sp+2223+0], i00
315 srlx i16
, 48, %l4 C
(i16
>> 48)
317 ldx
[%sp+2223+8], i16
318 srlx i48
, 16, %l5 C
(i48
>> 16)
319 mov i32
, %g4 C i32
+ now
in g4
320 ldx
[%sp+2223+16], i32
321 sllx i48
, 32, %l6 C
(i48
<< 32)
322 ldx
[%sp+2223+24], i48
324 srlx
%g4
, 32, %o3 C
(i32
>> 32)
325 add %l5
, %l4
, %o1 C hi64
- in %o1
326 std a00
, [%sp+2223+0]
327 sllx
%g4
, 16, %o2 C
(i32
<< 16)
328 add %o3
, %o1
, %o1 C hi64
in %o1
1st ASSIGNMENT
329 std a16
, [%sp+2223+8]
330 sllx
%o1
, 48, %o3 C
(hi64
<< 48)
331 add %g2
, %o2
, %o2 C mi64
- in %o2
332 std a32
, [%sp+2223+16]
333 add %l6
, %o2
, %o2 C mi64
- in %o2
334 std a48
, [%sp+2223+24]
335 sub %o2
, %o3
, %o2 C mi64
in %o2
1st ASSIGNMENT
336 add cy
, %g5
, %o4 C x
= prev
(i00
) + cy
341 ld
[%i5
+%i2
], %f3 C read
low 32 bits of up
[i
]
342 fmuld u32
, v32
, r64 C FIXME
not urgent
344 ld
[%i1
+%i2
], %f5 C read
high 32 bits of up
[i
]
347 mov i00
, %g5 C i00
+ now
in g5
348 fmuld u32
, v48
, r80 C FIXME
not urgent
350 ldx
[%sp+2223+0], i00
352 srlx i16
, 48, %l4 C
(i16
>> 48)
354 ldx
[%sp+2223+8], i16
356 srlx i48
, 16, %l5 C
(i48
>> 16)
357 mov i32
, %g4 C i32
+ now
in g4
358 ldx
[%sp+2223+16], i32
360 sllx i48
, 32, %l6 C
(i48
<< 32)
361 ldx
[%sp+2223+24], i48
363 srlx
%g4
, 32, %o3 C
(i32
>> 32)
364 add %l5
, %l4
, %o1 C hi64
- in %o1
365 std a00
, [%sp+2223+0]
367 sllx
%g4
, 16, %o2 C
(i32
<< 16)
368 add %o3
, %o1
, %o1 C hi64
in %o1
1st ASSIGNMENT
369 std a16
, [%sp+2223+8]
371 sllx
%o1
, 48, %o3 C
(hi64
<< 48)
372 add %g2
, %o2
, %o2 C mi64
- in %o2
373 std a32
, [%sp+2223+16]
375 add %l6
, %o2
, %o2 C mi64
- in %o2
376 std a48
, [%sp+2223+24]
379 sub %o2
, %o3
, %o2 C mi64
in %o2
1st ASSIGNMENT
382 add cy
, %g5
, %o4 C x
= prev
(i00
) + cy
394 srlx
%o4
, 16, %o5 C
(x
>> 16)
395 ld
[%i5
+%i2
], %f3 C read
low 32 bits of up
[i
]
396 fmuld u32
, v32
, r64 C FIXME
not urgent
399 add %o5
, %o2
, %o2 C mi64
in %o2
2nd ASSIGNMENT
400 and %o4
, xffff
, %o5 C
(x
& 0xffff)
401 ld
[%i1
+%i2
], %f5 C read
high 32 bits of up
[i
]
406 srlx
%o2
, 48, %o7 C
(mi64
>> 48)
407 mov i00
, %g5 C i00
+ now
in g5
408 fmuld u32
, v48
, r80 C FIXME
not urgent
411 sllx
%o2
, 16, %i3 C
(mi64
<< 16)
412 add %o7
, %o1
, cy C new cy
413 ldx
[%sp+2223+0], i00
416 srlx i16
, 48, %l4 C
(i16
>> 48)
418 ldx
[%sp+2223+8], i16
421 srlx i48
, 16, %l5 C
(i48
>> 16)
422 mov i32
, %g4 C i32
+ now
in g4
423 ldx
[%sp+2223+16], i32
426 sllx i48
, 32, %l6 C
(i48
<< 32)
428 ldx
[%sp+2223+24], i48
431 srlx
%g4
, 32, %o3 C
(i32
>> 32)
432 add %l5
, %l4
, %o1 C hi64
- in %o1
433 std a00
, [%sp+2223+0]
436 sllx
%g4
, 16, %o2 C
(i32
<< 16)
437 add %o3
, %o1
, %o1 C hi64
in %o1
1st ASSIGNMENT
438 std a16
, [%sp+2223+8]
441 sllx
%o1
, 48, %o3 C
(hi64
<< 48)
442 add %g2
, %o2
, %o2 C mi64
- in %o2
443 std a32
, [%sp+2223+16]
446 add %l6
, %o2
, %o2 C mi64
- in %o2
447 std a48
, [%sp+2223+24]
451 sub %o2
, %o3
, %o2 C mi64
in %o2
1st ASSIGNMENT
456 add cy
, %g5
, %o4 C x
= prev
(i00
) + cy
463 srlx
%o4
, 16, %o5 C
(x
>> 16)
464 fmuld u32
, v32
, r64 C FIXME
not urgent
466 add %o5
, %o2
, %o2 C mi64
in %o2
2nd ASSIGNMENT
467 and %o4
, xffff
, %o5 C
(x
& 0xffff)
470 srlx
%o2
, 48, %o7 C
(mi64
>> 48)
471 mov i00
, %g5 C i00
+ now
in g5
472 fmuld u32
, v48
, r80 C FIXME
not urgent
474 sllx
%o2
, 16, %i3 C
(mi64
<< 16)
475 add %o7
, %o1
, cy C new cy
476 ldx
[%sp+2223+0], i00
478 srlx i16
, 48, %l4 C
(i16
>> 48)
480 ldx
[%sp+2223+8], i16
481 srlx i48
, 16, %l5 C
(i48
>> 16)
482 mov i32
, %g4 C i32
+ now
in g4
483 ldx
[%sp+2223+16], i32
484 sllx i48
, 32, %l6 C
(i48
<< 32)
486 ldx
[%sp+2223+24], i48
488 srlx
%g4
, 32, %o3 C
(i32
>> 32)
489 add %l5
, %l4
, %o1 C hi64
- in %o1
490 std a00
, [%sp+2223+0]
491 sllx
%g4
, 16, %o2 C
(i32
<< 16)
492 add %o3
, %o1
, %o1 C hi64
in %o1
1st ASSIGNMENT
493 std a16
, [%sp+2223+8]
494 sllx
%o1
, 48, %o3 C
(hi64
<< 48)
495 add %g2
, %o2
, %o2 C mi64
- in %o2
496 std a32
, [%sp+2223+16]
497 add %l6
, %o2
, %o2 C mi64
- in %o2
498 std a48
, [%sp+2223+24]
499 sub %o2
, %o3
, %o2 C mi64
in %o2
1st ASSIGNMENT
501 add cy
, %g5
, %o4 C x
= prev
(i00
) + cy
504 srlx
%o4
, 16, %o5 C
(x
>> 16)
505 add %o5
, %o2
, %o2 C mi64
in %o2
2nd ASSIGNMENT
506 and %o4
, xffff
, %o5 C
(x
& 0xffff)
508 srlx
%o2
, 48, %o7 C
(mi64
>> 48)
509 mov i00
, %g5 C i00
+ now
in g5
511 sllx
%o2
, 16, %i3 C
(mi64
<< 16)
512 add %o7
, %o1
, cy C new cy
513 ldx
[%sp+2223+0], i00
514 srlx i16
, 48, %l4 C
(i16
>> 48)
516 ldx
[%sp+2223+8], i16
517 srlx i48
, 16, %l5 C
(i48
>> 16)
518 mov i32
, %g4 C i32
+ now
in g4
519 ldx
[%sp+2223+16], i32
520 sllx i48
, 32, %l6 C
(i48
<< 32)
522 ldx
[%sp+2223+24], i48
523 srlx
%g4
, 32, %o3 C
(i32
>> 32)
524 add %l5
, %l4
, %o1 C hi64
- in %o1
525 std a00
, [%sp+2223+0]
526 sllx
%g4
, 16, %o2 C
(i32
<< 16)
527 add %o3
, %o1
, %o1 C hi64
in %o1
1st ASSIGNMENT
528 std a16
, [%sp+2223+8]
529 sllx
%o1
, 48, %o3 C
(hi64
<< 48)
530 add %g2
, %o2
, %o2 C mi64
- in %o2
531 add %l6
, %o2
, %o2 C mi64
- in %o2
532 sub %o2
, %o3
, %o2 C mi64
in %o2
1st ASSIGNMENT
534 add cy
, %g5
, %o4 C x
= prev
(i00
) + cy
537 srlx
%o4
, 16, %o5 C
(x
>> 16)
538 add %o5
, %o2
, %o2 C mi64
in %o2
2nd ASSIGNMENT
539 and %o4
, xffff
, %o5 C
(x
& 0xffff)
540 srlx
%o2
, 48, %o7 C
(mi64
>> 48)
541 mov i00
, %g5 C i00
+ now
in g5
542 sllx
%o2
, 16, %i3 C
(mi64
<< 16)
543 add %o7
, %o1
, cy C new cy
544 ldx
[%sp+2223+0], i00
545 srlx i16
, 48, %l4 C
(i16
>> 48)
547 ldx
[%sp+2223+8], i16
548 srlx i48
, 16, %l5 C
(i48
>> 16)
549 mov i32
, %g4 C i32
+ now
in g4
550 sllx i48
, 32, %l6 C
(i48
<< 32)
552 srlx
%g4
, 32, %o3 C
(i32
>> 32)
553 add %l5
, %l4
, %o1 C hi64
- in %o1
554 sllx
%g4
, 16, %o2 C
(i32
<< 16)
555 add %o3
, %o1
, %o1 C hi64
in %o1
1st ASSIGNMENT
556 sllx
%o1
, 48, %o3 C
(hi64
<< 48)
557 add %g2
, %o2
, %o2 C mi64
- in %o2
558 add %l6
, %o2
, %o2 C mi64
- in %o2
559 sub %o2
, %o3
, %o2 C mi64
in %o2
1st ASSIGNMENT
561 add cy
, %g5
, %o4 C x
= prev
(i00
) + cy
564 srlx
%o4
, 16, %o5 C
(x
>> 16)
565 add %o5
, %o2
, %o2 C mi64
in %o2
2nd ASSIGNMENT
566 and %o4
, xffff
, %o5 C
(x
& 0xffff)
567 srlx
%o2
, 48, %o7 C
(mi64
>> 48)
568 sllx
%o2
, 16, %i3 C
(mi64
<< 16)
569 add %o7
, %o1
, cy C new cy