1 dnl SPARC v9 64-bit mpn_addmul_2 -- Multiply an n-limb number with a 2-limb
2 dnl number and add the result to an n-limb vector.
4 dnl Copyright
2002, 2003 Free Software Foundation
, Inc.
6 dnl
This file is part of the GNU MP Library.
8 dnl The GNU MP Library is free software
; you can redistribute it and/or modify
9 dnl it under the terms of
either:
11 dnl
* the GNU Lesser General
Public License as published by the Free
12 dnl Software Foundation
; either version 3 of the License, or (at your
13 dnl option
) any later version.
17 dnl
* the GNU General
Public License as published by the Free Software
18 dnl Foundation
; either version 2 of the License, or (at your option) any
21 dnl
or both
in parallel
, as here.
23 dnl The GNU MP Library is distributed
in the hope that it will be useful
, but
24 dnl WITHOUT ANY WARRANTY
; without even the implied warranty of MERCHANTABILITY
25 dnl
or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
Public License
28 dnl You should have received copies of the GNU General
Public License
and the
29 dnl GNU Lesser General
Public License along with the GNU MP Library. If
not,
30 dnl see https://www.gnu.org/licenses/.
32 include(`..
/config.m4
')
38 C Algorithm: We use 16 floating-point multiplies per limb product, with the
39 C 2-limb v operand split into eight 16-bit pieces, and the n-limb u operand
40 C split into 32-bit pieces. We sum four 48-bit partial products using
41 C floating-point add, then convert the resulting four 50-bit quantities and
42 C transfer them to the integer unit.
44 C Possible optimizations:
45 C 1. Align the stack area where we transfer the four 50-bit product-sums
46 C to a 32-byte boundary. That would minimize the cache collision.
47 C (UltraSPARC-1/2 use a direct-mapped cache.) (Perhaps even better would
48 C be to align the area to map to the area immediately before up?)
49 C 2. Perform two of the fp->int conversions with integer instructions. We
50 C can get almost ten free IEU slots, if we clean up bookkeeping and the
51 C silly carry-limb code.
52 C 3. For an mpn_addmul_1 based on this, we need to fix the silly carry-limb code.
55 C OSP (Overlapping software pipeline) version of mpn_mul_basecase:
56 C Operand swap will require 8 LDDA and 8 FXTOD, which will mean 8 cycles.
62 C Instruction classification (as per UltraSPARC functional units).
63 C Assuming silly carry code is fixed. Includes bookkeeping.
65 C mpn_addmul_X mpn_mul_X
67 C ========== ==========
75 C TOTAL IEU 17 17 16 16
78 C IEU cycles 8.5 8.5 8 8
79 C MEM cycles 12 12 10 10
80 C ISSUE cycles 12 16 11.25 15.25
81 C FPU cycles 10 18 10 18
82 C cycles/loop 12 18 12 18
83 C cycles/limb 12 9 12 9
C Register declarations and m4 aliases used by mpn_addmul_2 below.
C %g2 and %g3 are declared as scratch registers through the REGISTER macro
C (the macro itself is defined outside this view).
94 REGISTER(%g2,#scratch)
95 REGISTER(%g3,#scratch)
C p000..p112: fp registers that receive the fmuld products of a u half and
C one v piece.  p096 and p112 exist in two copies: the a copies are written by
C the u00 multiplies and the b copies by the u32 multiplies in the code below.
105 define(`p000', `
%f8
') define(`p016',`
%f10
')
106 define(`p032',`
%f12
') define(`p048',`
%f14
')
107 define(`p064',`
%f16
') define(`p080',`
%f18
')
108 define(`p096a',`
%f20
') define(`p112a',`
%f22
')
109 define(`p096b',`
%f56
') define(`p112b',`
%f58
')
C out000/out016: fp registers used as fdtox destination and as the source of
C the std stores into the stack transfer area.
111 define(`out000',`
%f0
') define(`out016',`
%f6
')
C v000..v112: the eight pieces of the 2-limb v operand (see the
C Initialization comment in the function: eight 16-bit chunks kept as doubles).
113 define(`v000',`
%f24
') define(`v016',`
%f26
')
114 define(`v032',`
%f28
') define(`v048',`
%f30
')
115 define(`v064',`
%f44
') define(`v080',`
%f46
')
116 define(`v096',`
%f48
') define(`v112',`
%f50
')
C u00/u32: fp registers used as the u-side operand of every fmuld.
C NOTE(review): the instructions that fill u00/u32 from the u00_lo/u32_lo
C loads are not visible in this view -- confirm against the full file.
118 define(`u00',`
%f32
') define(`u32', `
%f34
')
C a000..a080: destinations of the faddd instructions that sum products.
120 define(`a000',`
%f36
') define(`a016',`
%f38
')
121 define(`a032',`
%f40
') define(`a048',`
%f42
')
122 define(`a064',`
%f60
') define(`a080',`
%f62
')
C Single-precision halves %f2/%f3 and %f4/%f5.  The _lo halves are loaded
C with the low and high 32 bits of up[i]; the _hi halves are zeroed during
C initialization (see the ld instructions in the function).
124 define(`u00_hi',`
%f2
') define(`u32_hi',`
%f4
')
125 define(`u00_lo',`
%f3
') define(`u32_lo',`
%f5
')
C Integer-unit aliases: rlimb, i00 and i16 are loaded from the stack transfer
C area; r00 and r32 receive the low and high 32 bits of rp[i].
128 define(`rlimb',`
%g3
')
129 define(`i00',`
%l0
') define(`i16',`
%l1
')
130 define(`r00',`
%l2
') define(`r32',`
%l3
')
C Mask registers: set with srlx of %g4 by 32 (xffffffff) and by 48 (xffff).
131 define(`xffffffff',`
%l7
')
132 define(`xffff',`
%o0
')
135 PROLOGUE(mpn_addmul_2)
C mpn_addmul_2: per the file header, multiply an n-limb number by a 2-limb
C number and add the result to an n-limb vector.
C
C Register roles as established by the loads and comments below:
C   %i0  rp, read 32 bits at a time as rp[i]
C   %i1  up, read 32 bits at a time as up[i]
C   %i2  remaining limb count, decremented once per limb and tested by brnz
C   %i3  vp, the two limbs vp[0] and vp[1]
C The area [%sp+2223+0 .. %sp+2223+63] is used to move 64-bit values between
C the integer unit and the fp unit (stx then ldd, and std then ldx).
C NOTE(review): 2223 is presumably the 2047 stack bias plus a frame offset,
C and the save/return instructions are not visible in this view -- confirm.
C NOTE(review): the setup of %g4 and %asi is not visible here.  The srlx
C instructions below only yield the advertised masks if %g4 is all ones.
137 C Initialization. (1) Split v operand into eight 16-bit chunks and store them
138 C as IEEE double in fp registers. (2) Clear upper 32 bits of fp register pairs
139 C f2 and f4. (3) Store masks in registers aliased to `xffff' and `xffffffff
'.
140 C This code could be better scheduled.
147 srlx
%g4
, 32, xffffffff C store
mask in register `xffffffff
'
C First v-loading sequence: one ldda per 16-bit piece, at byte offsets 6, 4,
C 2, 0 of vp[0] and 14, 12, 10, 8 of vp[1].
C NOTE(review): this sequence and the stx/ldd sequence further down both
C fill v000..v112, and the xffffffff mask is set twice.  They look like
C alternatives selected by m4 conditionals that are not visible -- confirm.
148 ldda [%i3+6] %asi, v000
149 ldda [%i3+4] %asi, v016
150 ldda [%i3+2] %asi, v032
151 ldda [%i3+0] %asi, v048
153 ldda [%i3+14] %asi, v064
155 ldda [%i3+12] %asi, v080
157 ldda [%i3+10] %asi, v096
159 ldda [%i3+8] %asi, v112
168 ldx
[%i3
+0], %l0 C vp
[0]
169 srlx
%g4
, 48, xffff C store
mask in register `xffff
'
170 ldx [%i3+8], %l1 C vp[1]
C Second v-loading sequence: %g2 and %g3 are stored alternately into eight
C 8-byte stack slots, which are then reloaded as doubles into v000..v112.
C NOTE(review): the instructions that compute %g2/%g3 between the stores are
C not visible in this view.
173 stx %g2, [%sp+2223+0]
176 stx %g3, [%sp+2223+8]
179 stx %g2, [%sp+2223+16]
181 stx %g3, [%sp+2223+24]
183 stx %g2, [%sp+2223+32]
186 stx %g3, [%sp+2223+40]
189 stx %g2, [%sp+2223+48]
191 stx %g3, [%sp+2223+56]
193 srlx %g4, 32, xffffffff C store mask in register `xffffffff'
195 ldd
[%sp+2223+0], v000
196 ldd
[%sp+2223+8], v016
197 ldd
[%sp+2223+16], v032
198 ldd
[%sp+2223+24], v048
200 ldd
[%sp+2223+32], v064
202 ldd
[%sp+2223+40], v080
204 ldd
[%sp+2223+48], v096
206 ldd
[%sp+2223+56], v112
208 ld
[%sp+2223+0], u00_hi C zero u00_hi
210 ld
[%sp+2223+0], u32_hi C zero u32_hi
214 C Initialization done.
C Bias rp by -8: the loop reads the low word of rp[i] at [%i0+4+8], then
C advances %i0 by 8 before reading the high word at [%i0+0].
218 add %i0, -8, %i0 C BOOKKEEPING
220 C Start software pipeline.
C First stage: multiply the low half of up[0] (u00) by all eight v pieces.
222 ld [%i1+4], u00_lo C read low 32 bits of up[i]
225 ld [%i1+0], u32_lo C read high 32 bits of up[i]
226 fmuld u00, v000, a000
227 fmuld u00, v016, a016
228 fmuld u00, v032, a032
229 fmuld u00, v048, a048
230 add %i2, -1, %i2 C BOOKKEEPING
231 fmuld u00, v064, p064
232 add %i1, 8, %i1 C BOOKKEEPING
234 fmuld u00, v080, p080
235 fmuld u00, v096, p096a
C The fmuld after each brnz sits in the branch delay slot, so it executes
C whether or not the branch is taken.
C NOTE(review): the labels .L_2_or_more and .L_3_or_more are branch targets
C but are not visible in this view.
236 brnz,pt %i2, .L_2_or_more
237 fmuld u00, v112, p112a
C Fall-through path taken when the count reached zero after the first limb.
239 .L1: fdtox a000, out000
240 fmuld u32, v000, p000
242 fmuld u32, v016, p016
244 fmuld u32, v032, p032
246 fmuld u32, v048, p048
247 std out000, [%sp+2223+16]
248 faddd p000, a032, a000
249 fmuld u32, v064, p064
250 std out016, [%sp+2223+24]
252 faddd p016, a048, a016
253 fmuld u32, v080, p080
254 faddd p032, a064, a032
255 fmuld u32, v096, p096b
256 faddd p048, a080, a048
257 fmuld u32, v112, p112b
261 faddd p064, p096a, a064
262 faddd p080, p112a, a080
263 std out000, [%sp+2223+0]
265 std out016, [%sp+2223+8]
C Next pipeline stage: finish the u32 products of the previous limb while
C loading the next up limb, then start its u00 products.
268 ld [%i1+4], u00_lo C read low 32 bits of up[i]
270 fmuld u32, v000, p000
272 fmuld u32, v016, p016
274 fmuld u32, v032, p032
276 fmuld u32, v048, p048
277 std out000, [%sp+2223+16]
278 faddd p000, a032, a000
279 fmuld u32, v064, p064
280 std out016, [%sp+2223+24]
282 faddd p016, a048, a016
283 fmuld u32, v080, p080
284 faddd p032, a064, a032
285 fmuld u32, v096, p096b
286 faddd p048, a080, a048
287 fmuld u32, v112, p112b
289 ld [%i1+0], u32_lo C read high 32 bits of up[i]
291 fmuld u00, v000, p000
293 fmuld u00, v016, p016
294 faddd p064, p096a, a064
295 fmuld u00, v032, p032
296 faddd p080, p112a, a080
297 fmuld u00, v048, p048
298 add %i2, -1, %i2 C BOOKKEEPING
299 std out000, [%sp+2223+0]
300 faddd p000, a032, a000
301 fmuld u00, v064, p064
302 add %i1, 8, %i1 C BOOKKEEPING
303 std out016, [%sp+2223+8]
305 faddd p016, a048, a016
306 fmuld u00, v080, p080
307 faddd p032, a064, a032
308 fmuld u00, v096, p096a
309 faddd p048, a080, a048
310 brnz,pt %i2, .L_3_or_more
311 fmuld u00, v112, p112a
320 C . |_______i00__| 50
321 C |_______i16__| . 50
C Main loop body, in two halves.  The first half multiplies u32 by the eight
C v pieces and reads the low word of rp[i]; the second half multiplies u00 and
C reads the high word.  Each half also sums products with faddd, stores
C out000/out016 to the stack area and reloads earlier values into i00/i16.
C NOTE(review): cy is used as a register alias but no define for it is
C visible, and no loop-closing branch back to .Loop is visible in this view.
327 .Loop: ld [%i1+4], u00_lo C read low 32 bits of up[i]
328 and %g2, xffffffff, %g2
330 fmuld u32, v000, p000
332 lduw [%i0+4+8], r00 C read low 32 bits of rp[i]
335 fmuld u32, v016, p016
338 ldx [%sp+2223+16], i00
339 faddd p064, p096b, a064
340 fmuld u32, v032, p032
342 add %g4, cy, cy C new cy
343 ldx [%sp+2223+24], i16
344 faddd p080, p112b, a080
345 fmuld u32, v048, p048
348 std out000, [%sp+2223+16]
349 faddd p000, a032, a000
350 fmuld u32, v064, p064
353 add %i0, 8, %i0 C BOOKKEEPING
354 std out016, [%sp+2223+24]
359 faddd p016, a048, a016
360 fmuld u32, v080, p080
364 faddd p032, a064, a032
365 fmuld u32, v096, p096b
369 faddd p048, a080, a048
370 fmuld u32, v112, p112b
372 ld [%i1+0], u32_lo C read high 32 bits of up[i]
373 and %g2, xffffffff, %g2
375 fmuld u00, v000, p000
377 lduw [%i0+0], r32 C read high 32 bits of rp[i]
380 fmuld u00, v016, p016
383 ldx [%sp+2223+0], i00
384 faddd p064, p096a, a064
385 fmuld u00, v032, p032
387 add %g4, cy, cy C new cy
388 ldx [%sp+2223+8], i16
389 faddd p080, p112a, a080
390 fmuld u00, v048, p048
392 add %i2, -1, %i2 C BOOKKEEPING
393 std out000, [%sp+2223+0]
394 faddd p000, a032, a000
395 fmuld u00, v064, p064
398 add %i1, 8, %i1 C BOOKKEEPING
399 std out016, [%sp+2223+8]
404 faddd p016, a048, a016
405 fmuld u00, v080, p080
409 faddd p032, a064, a032
410 fmuld u00, v096, p096a
413 faddd p048, a080, a048
415 fmuld u00, v112, p112a
C Pipeline drain, stage 1: no further up loads.  The last u32 products are
C formed and summed while the rp reads and stack reloads continue.
419 .Lend: and %g2, xffffffff, %g2
421 fmuld u32, v000, p000
422 lduw [%i0+4+8], r00 C read low 32 bits of rp[i]
425 fmuld u32, v016, p016
427 ldx [%sp+2223+16], i00
428 faddd p064, p096b, a064
429 fmuld u32, v032, p032
430 add %g4, cy, cy C new cy
431 ldx [%sp+2223+24], i16
432 faddd p080, p112b, a080
433 fmuld u32, v048, p048
434 std out000, [%sp+2223+16]
435 faddd p000, a032, a000
436 fmuld u32, v064, p064
438 add %i0, 8, %i0 C BOOKKEEPING
439 std out016, [%sp+2223+24]
442 faddd p016, a048, a016
443 fmuld u32, v080, p080
446 faddd p032, a064, a032
447 fmuld u32, v096, p096b
449 faddd p048, a080, a048
450 fmuld u32, v112, p112b
452 and %g2, xffffffff, %g2
454 lduw [%i0+0], r32 C read high 32 bits of rp[i]
458 ldx [%sp+2223+0], i00
459 faddd p064, p096a, a064
460 add %g4, cy, cy C new cy
461 ldx [%sp+2223+8], i16
462 faddd p080, p112a, a080
463 std out000, [%sp+2223+0]
465 std out016, [%sp+2223+8]
C Pipeline drain, stage 2: no fp arithmetic remains in the visible lines, only
C rp reads, stack reloads and the final std stores of out000/out016.
473 .L_wd2: and %g2, xffffffff, %g2
475 lduw [%i0+4+8], r00 C read low 32 bits of rp[i]
479 ldx [%sp+2223+16], i00
480 add %g4, cy, cy C new cy
481 ldx [%sp+2223+24], i16
482 std out000, [%sp+2223+16]
484 add %i0, 8, %i0 C BOOKKEEPING
485 std out016, [%sp+2223+24]
492 and %g2, xffffffff, %g2
494 lduw [%i0+0], r32 C read high 32 bits of rp[i]
498 ldx [%sp+2223+0], i00
499 add %g4, cy, cy C new cy
500 ldx [%sp+2223+8], i16
501 std out000, [%sp+2223+0]
503 std out016, [%sp+2223+8]
C Pipeline drain, stage 3: the remaining values are reloaded from the stack
C area.  The first two reloads go into rlimb instead of i00.
C NOTE(review): the instructions that combine these values and store them to
C rp are not visible in this view.
511 .L_wd3: and %g2, xffffffff, %g2
516 ldx [%sp+2223+16], rlimb
517 add %g4, cy, cy C new cy
518 ldx [%sp+2223+24], i16
519 std out000, [%sp+2223+16]
520 add %i0, 8, %i0 C BOOKKEEPING
521 std out016, [%sp+2223+24]
528 and %g2, xffffffff, %g2
531 ldx [%sp+2223+0], rlimb
532 add %g4, cy, cy C new cy
533 ldx [%sp+2223+8], i16
540 and %g2, xffffffff, %g2
543 ldx [%sp+2223+16], i00
544 add %g4, cy, cy C new cy
545 ldx [%sp+2223+24], i16
551 EPILOGUE(mpn_addmul_2)