beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / sparc64 / ultrasparc1234 / addmul_2.asm
blob37674d7423bfef876e1cc2dd1e53778f08d88f5f
1 dnl SPARC v9 64-bit mpn_addmul_2 -- Multiply an n limb number with 2-limb
2 dnl number and add the result to a n limb vector.
4 dnl Copyright 2002, 2003 Free Software Foundation, Inc.
6 dnl This file is part of the GNU MP Library.
7 dnl
8 dnl The GNU MP Library is free software; you can redistribute it and/or modify
9 dnl it under the terms of either:
10 dnl
11 dnl * the GNU Lesser General Public License as published by the Free
12 dnl Software Foundation; either version 3 of the License, or (at your
13 dnl option) any later version.
14 dnl
15 dnl or
16 dnl
17 dnl * the GNU General Public License as published by the Free Software
18 dnl Foundation; either version 2 of the License, or (at your option) any
19 dnl later version.
20 dnl
21 dnl or both in parallel, as here.
22 dnl
23 dnl The GNU MP Library is distributed in the hope that it will be useful, but
24 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
25 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
26 dnl for more details.
27 dnl
28 dnl You should have received copies of the GNU General Public License and the
29 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
30 dnl see https://www.gnu.org/licenses/.
32 include(`../config.m4')
34 C cycles/limb
35 C UltraSPARC 1&2: 9
36 C UltraSPARC 3: 10
38 C Algorithm: We use 16 floating-point multiplies per limb product, with the
39 C 2-limb v operand split into eight 16-bit pieces, and the n-limb u operand
40 C split into 32-bit pieces. We sum four 48-bit partial products using
41 C floating-point add, then convert the resulting four 50-bit quantities and
42 C transfer them to the integer unit.
44 C Possible optimizations:
45 C 1. Align the stack area where we transfer the four 50-bit product-sums
46 C to a 32-byte boundary. That would minimize the cache collision.
47 C (UltraSPARC-1/2 use a direct-mapped cache.) (Perhaps even better would
48 C be to align the area to map to the area immediately before up?)
49 C 2. Perform two of the fp->int conversions with integer instructions. We
50 C can get almost ten free IEU slots, if we clean up bookkeeping and the
51 C silly carry-limb code.
52 C 3. For an mpn_addmul_1 based on this, we need to fix the silly carry-limb
53 C code.
55 C OSP (Overlapping software pipeline) version of mpn_mul_basecase:
56 C Operand swap will require 8 LDDA and 8 FXTOD, which will mean 8 cycles.
57 C FI = 20
58 C L = 9 x un * vn
59 C WDFI = 10 x vn / 2
60 C WD = 4
C Instruction classification (as per UltraSPARC functional units).
C Assuming silly carry code is fixed.  Includes bookkeeping.
C
C               mpn_addmul_X     mpn_mul_X
C                1       2       1       2
C               ==========      ==========
C      FM        8      16       8      16
C      FA       10      18      10      18
C     MEM       12      12      10      10
C  ISHIFT        6       6       6       6
C IADDLOG       11      11      10      10
C  BRANCH        1       1       1       1
C
C TOTAL IEU     17      17      16      16
C TOTAL         48      64      45      61
C
C IEU cycles     8.5     8.5     8       8
C MEM cycles    12      12      10      10
C ISSUE cycles  12      16      11.25   15.25
C FPU cycles    10      18      10      18
C cycles/loop   12      18      12      18
C cycles/limb   12       9      12       9
86 C INPUT PARAMETERS
87 C rp[n + 1] i0
88 C up[n] i1
89 C n i2
90 C vp[2] i3
ASM_START()
C %g2 and %g3 are used as integer scratch by the code below.  REGISTER is a
C macro supplied by the build (not defined in this file); presumably it emits
C the assembler declaration required before those globals may be used --
C confirm against config.m4.
	REGISTER(%g2,#scratch)
	REGISTER(%g3,#scratch)

C Notes on possible register merging (author TODO list).
C Combine registers:
C u00_hi= u32_hi
C u00_lo= u32_lo
C a000  = out000
C a016  = out016
C Free: f52 f54


C Naming scheme for the m4 aliases below.  The numeric suffix is a bit
C position: 000, 016, ... 112 for the sixteen-bit pieces of the 2-limb v
C operand, 00 and 32 for the two 32-bit halves of one u limb.

C pNNN: floating-point partial products (one 32-bit u half times one v piece).
C p096 and p112 exist in an a and a b copy, one per u half, because those
C products are consumed later than they are produced.
define(`p000', `%f8')  define(`p016',`%f10')
define(`p032',`%f12')  define(`p048',`%f14')
define(`p064',`%f16')  define(`p080',`%f18')
define(`p096a',`%f20') define(`p112a',`%f22')
define(`p096b',`%f56') define(`p112b',`%f58')

C out000/out016: results of fdtox, i.e. product sums converted to 64-bit
C integers, ready to be stored to the stack for the integer unit.
define(`out000',`%f0') define(`out016',`%f6')

C vNNN: the eight 16-bit pieces of vp[0] and vp[1], each held as a double.
define(`v000',`%f24')  define(`v016',`%f26')
define(`v032',`%f28')  define(`v048',`%f30')
define(`v064',`%f44')  define(`v080',`%f46')
define(`v096',`%f48')  define(`v112',`%f50')

C u00/u32: low and high 32-bit half of the current u limb, as doubles.
define(`u00',`%f32')   define(`u32', `%f34')

C aNNN: running sums of partial products (results of faddd).
define(`a000',`%f36')  define(`a016',`%f38')
define(`a032',`%f40')  define(`a048',`%f42')
define(`a064',`%f60')  define(`a080',`%f62')

C Integer staging for u.  The _hi/_lo names are the two single-precision
C halves of the double registers f2 and f4: _lo receives a 32-bit load from
C up, _hi is kept zero, and fxtod on the pair converts the zero-extended
C 32-bit value to double.
define(`u00_hi',`%f2') define(`u32_hi',`%f4')
define(`u00_lo',`%f3') define(`u32_lo',`%f5')

C Integer-unit registers.
define(`cy',`%g1')			C carry into the next 32-bit half
define(`rlimb',`%g3')			C 32-bit result half being assembled
define(`i00',`%l0') define(`i16',`%l1')	C product sums reloaded from the stack
define(`r00',`%l2') define(`r32',`%l3')	C low and high 32 bits of rp[i]
define(`xffffffff',`%l7')		C constant mask 0xffffffff
define(`xffff',`%o0')			C constant mask 0xffff (non-VIS init only)
C mpn_addmul_2 -- multiply the n-limb number at up by the 2-limb number at vp
C and add the product to the limbs at rp.
C
C In:	%i0 = rp, %i1 = up, %i2 = n, %i3 = vp (after the save below).
C	n must be at least 1: up[0] is loaded before n is tested.
C Out:	rp[0..n-1] have the product added in, rp[n] is written (it is never
C	read here), and the remaining most significant limb is returned in
C	the caller %o0.
C
C Method: each 32-bit half of a u limb is converted to double and multiplied
C by the eight 16-bit pieces of v.  Products of equal weight are summed with
C faddd, converted back with fdtox, and passed through four 8-byte stack slots
C to the integer unit, which adds the rp half and the running carry and stores
C 32 bits at a time.  The fp side runs one step ahead of the integer side,
C hence the start-up code and the three wind-down phases.
C
C Stack addressing: 2223 = 2047 (SPARC v9 stack bias) + 176 (register window
C save area plus six argument slots), so %sp+2223 is the first free byte of
C the 256-byte frame.  Slots +0/+8 and +16/+24 are the fp-to-integer transfer
C area; the non-VIS init code additionally uses +0..+63 for the v pieces.
C
C Every control transfer below (brnz, b, return) has a delay slot: the
C instruction written after it executes before the transfer takes effect.

PROLOGUE(mpn_addmul_2)

C Initialization.  (1) Split v operand into eight 16-bit chunks and store them
C as IEEE double in fp registers.  (2) Clear upper 32 bits of fp register pairs
C f2 and f4.  (3) Store masks in registers aliased to `xffff' and `xffffffff'.
C This code could be better scheduled.

	save	%sp, -256, %sp

C With VIS the pieces are fetched directly by 16-bit fp loads through ASI 0xD2
C (the VIS 16-bit short floating-point load ASI).  The target is big-endian,
C so offset 6 holds the least significant 16 bits of vp[0].  Without VIS the
C pieces are extracted with shifts and masks and bounced through the stack.
C Comments inside the two quoted branches must keep m4 quotes balanced.
ifdef(`HAVE_VIS',
`	mov	-1, %g4
	wr	%g0, 0xD2, %asi
	srlx	%g4, 32, xffffffff	C store mask in register `xffffffff'
	ldda	[%i3+6] %asi, v000
	ldda	[%i3+4] %asi, v016
	ldda	[%i3+2] %asi, v032
	ldda	[%i3+0] %asi, v048
	fxtod	v000, v000
	ldda	[%i3+14] %asi, v064
	fxtod	v016, v016
	ldda	[%i3+12] %asi, v080
	fxtod	v032, v032
	ldda	[%i3+10] %asi, v096
	fxtod	v048, v048
	ldda	[%i3+8] %asi, v112
	fxtod	v064, v064
	fxtod	v080, v080
	fxtod	v096, v096
	fxtod	v112, v112
	fzero	u00_hi
	fzero	u32_hi
',
`	mov	-1, %g4
	ldx	[%i3+0], %l0		C vp[0]
	srlx	%g4, 48, xffff		C store mask in register `xffff'
	ldx	[%i3+8], %l1		C vp[1]

	and	%l0, xffff, %g2
	stx	%g2, [%sp+2223+0]
	srlx	%l0, 16, %g3
	and	%g3, xffff, %g3
	stx	%g3, [%sp+2223+8]
	srlx	%l0, 32, %g2
	and	%g2, xffff, %g2
	stx	%g2, [%sp+2223+16]
	srlx	%l0, 48, %g3
	stx	%g3, [%sp+2223+24]
	and	%l1, xffff, %g2
	stx	%g2, [%sp+2223+32]
	srlx	%l1, 16, %g3
	and	%g3, xffff, %g3
	stx	%g3, [%sp+2223+40]
	srlx	%l1, 32, %g2
	and	%g2, xffff, %g2
	stx	%g2, [%sp+2223+48]
	srlx	%l1, 48, %g3
	stx	%g3, [%sp+2223+56]

	srlx	%g4, 32, xffffffff	C store mask in register `xffffffff'

	ldd	[%sp+2223+0], v000
	ldd	[%sp+2223+8], v016
	ldd	[%sp+2223+16], v032
	ldd	[%sp+2223+24], v048
	fxtod	v000, v000
	ldd	[%sp+2223+32], v064
	fxtod	v016, v016
	ldd	[%sp+2223+40], v080
	fxtod	v032, v032
	ldd	[%sp+2223+48], v096
	fxtod	v048, v048
	ldd	[%sp+2223+56], v112
	fxtod	v064, v064
	ld	[%sp+2223+0], u00_hi	C zero u00_hi
	fxtod	v080, v080
	ld	[%sp+2223+0], u32_hi	C zero u32_hi
	fxtod	v096, v096
	fxtod	v112, v112
')
C Initialization done.
C Clear the deferred carry state (%g2, rlimb, %g4).  rp is biased down by one
C limb because each store below is preceded by an rp += 8.
	mov	0, %g2
	mov	0, rlimb
	mov	0, %g4
	add	%i0, -8, %i0		C BOOKKEEPING

C Start software pipeline.
C Form all eight products of the low half of up[0]; nothing is summed yet.

	ld	[%i1+4], u00_lo		C read low 32 bits of up[i]
	fxtod	u00_hi, u00
C mid
	ld	[%i1+0], u32_lo		C read high 32 bits of up[i]
	fmuld	u00, v000, a000
	fmuld	u00, v016, a016
	fmuld	u00, v032, a032
	fmuld	u00, v048, a048
	add	%i2, -1, %i2		C BOOKKEEPING
	fmuld	u00, v064, p064
	add	%i1, 8, %i1		C BOOKKEEPING
	fxtod	u32_hi, u32
	fmuld	u00, v080, p080
	fmuld	u00, v096, p096a
	brnz,pt	%i2, .L_2_or_more
	fmuld	u00, v112, p112a	C delay slot, executed on both paths

C n = 1: finish the products of the only limb, then go straight to wind-down
C phase 2.  No further u data is loaded on this path.
.L1:	fdtox	a000, out000
	fmuld	u32, v000, p000
	fdtox	a016, out016
	fmuld	u32, v016, p016
	fmovd	p064, a064
	fmuld	u32, v032, p032
	fmovd	p080, a080
	fmuld	u32, v048, p048
	std	out000, [%sp+2223+16]
	faddd	p000, a032, a000
	fmuld	u32, v064, p064
	std	out016, [%sp+2223+24]
	fxtod	u00_hi, u00
	faddd	p016, a048, a016
	fmuld	u32, v080, p080
	faddd	p032, a064, a032
	fmuld	u32, v096, p096b
	faddd	p048, a080, a048
	fmuld	u32, v112, p112b
C mid
	fdtox	a000, out000
	fdtox	a016, out016
	faddd	p064, p096a, a064
	faddd	p080, p112a, a080
	std	out000, [%sp+2223+0]
	b	.L_wd2
	std	out016, [%sp+2223+8]	C delay slot

C n >= 2: same fp work as above, but the next u limb is loaded and its low
C half products are started.  The integer side has nothing to consume yet.
.L_2_or_more:
	ld	[%i1+4], u00_lo		C read low 32 bits of up[i]
	fdtox	a000, out000
	fmuld	u32, v000, p000
	fdtox	a016, out016
	fmuld	u32, v016, p016
	fmovd	p064, a064
	fmuld	u32, v032, p032
	fmovd	p080, a080
	fmuld	u32, v048, p048
	std	out000, [%sp+2223+16]
	faddd	p000, a032, a000
	fmuld	u32, v064, p064
	std	out016, [%sp+2223+24]
	fxtod	u00_hi, u00
	faddd	p016, a048, a016
	fmuld	u32, v080, p080
	faddd	p032, a064, a032
	fmuld	u32, v096, p096b
	faddd	p048, a080, a048
	fmuld	u32, v112, p112b
C mid
	ld	[%i1+0], u32_lo		C read high 32 bits of up[i]
	fdtox	a000, out000
	fmuld	u00, v000, p000
	fdtox	a016, out016
	fmuld	u00, v016, p016
	faddd	p064, p096a, a064
	fmuld	u00, v032, p032
	faddd	p080, p112a, a080
	fmuld	u00, v048, p048
	add	%i2, -1, %i2		C BOOKKEEPING
	std	out000, [%sp+2223+0]
	faddd	p000, a032, a000
	fmuld	u00, v064, p064
	add	%i1, 8, %i1		C BOOKKEEPING
	std	out016, [%sp+2223+8]
	fxtod	u32_hi, u32
	faddd	p016, a048, a016
	fmuld	u00, v080, p080
	faddd	p032, a064, a032
	fmuld	u00, v096, p096a
	faddd	p048, a080, a048
	brnz,pt	%i2, .L_3_or_more
	fmuld	u00, v112, p112a	C delay slot, executed on both paths

C n = 2: skip the main loop.
	b	.Lend
	nop				C delay slot

C How one 32-bit result half is put together on the integer side.  The number
C on the right is the width in bits of each quantity.
C
C			64      32       0
C			 .       .       .
C			 .       |__rXXX_|	32
C			 .      |___cy___|	34
C			 .  |_______i00__|	50
C		|_______i16__|	 .	50

C BEGIN MAIN LOOP
C One iteration handles one u limb in two halves.  Each half: (a) derive the
C carry out of the half stored previously (%g2 masked to 32 bits plus rlimb,
C shifted right 32, plus the high part of i16 kept in %g4); (b) reload i00 and
C i16 from the stack, add the rp half and the carry, store the low 32 bits;
C (c) meanwhile form and sum the next fp products and store the next pair of
C converted sums into the stack slots just read.
	.align	16
.L_3_or_more:
.Loop:	ld	[%i1+4], u00_lo		C read low 32 bits of up[i]
	and	%g2, xffffffff, %g2
	fdtox	a000, out000
	fmuld	u32, v000, p000

	lduw	[%i0+4+8], r00		C read low 32 bits of rp[i]
	add	%g2, rlimb, %l5
	fdtox	a016, out016
	fmuld	u32, v016, p016

	srlx	%l5, 32, cy
	ldx	[%sp+2223+16], i00
	faddd	p064, p096b, a064
	fmuld	u32, v032, p032

	add	%g4, cy, cy		C new cy
	ldx	[%sp+2223+24], i16
	faddd	p080, p112b, a080
	fmuld	u32, v048, p048

	nop				C filler, keeps four instructions per group
	std	out000, [%sp+2223+16]
	faddd	p000, a032, a000
	fmuld	u32, v064, p064

	add	i00, r00, rlimb
	add	%i0, 8, %i0		C BOOKKEEPING
	std	out016, [%sp+2223+24]
	fxtod	u00_hi, u00

	sllx	i16, 16, %g2
	add	cy, rlimb, rlimb
	faddd	p016, a048, a016
	fmuld	u32, v080, p080

	srlx	i16, 16, %g4
	add	%g2, rlimb, %l5
	faddd	p032, a064, a032
	fmuld	u32, v096, p096b

	stw	%l5, [%i0+4]
	nop				C filler, keeps four instructions per group
	faddd	p048, a080, a048
	fmuld	u32, v112, p112b
C midloop
	ld	[%i1+0], u32_lo		C read high 32 bits of up[i]
	and	%g2, xffffffff, %g2
	fdtox	a000, out000
	fmuld	u00, v000, p000

	lduw	[%i0+0], r32		C read high 32 bits of rp[i]
	add	%g2, rlimb, %l5
	fdtox	a016, out016
	fmuld	u00, v016, p016

	srlx	%l5, 32, cy
	ldx	[%sp+2223+0], i00
	faddd	p064, p096a, a064
	fmuld	u00, v032, p032

	add	%g4, cy, cy		C new cy
	ldx	[%sp+2223+8], i16
	faddd	p080, p112a, a080
	fmuld	u00, v048, p048

	add	%i2, -1, %i2		C BOOKKEEPING
	std	out000, [%sp+2223+0]
	faddd	p000, a032, a000
	fmuld	u00, v064, p064

	add	i00, r32, rlimb
	add	%i1, 8, %i1		C BOOKKEEPING
	std	out016, [%sp+2223+8]
	fxtod	u32_hi, u32

	sllx	i16, 16, %g2
	add	cy, rlimb, rlimb
	faddd	p016, a048, a016
	fmuld	u00, v080, p080

	srlx	i16, 16, %g4
	add	%g2, rlimb, %l5
	faddd	p032, a064, a032
	fmuld	u00, v096, p096a

	stw	%l5, [%i0+0]
	faddd	p048, a080, a048
	brnz,pt	%i2, .Loop
	fmuld	u00, v112, p112a	C delay slot, executed on both paths
C END MAIN LOOP

C WIND-DOWN PHASE 1
C Same as a loop iteration except that no more u data is loaded or converted:
C the first half still multiplies by the last high half of u, the second half
C performs no multiplies at all.
.Lend:	and	%g2, xffffffff, %g2
	fdtox	a000, out000
	fmuld	u32, v000, p000
	lduw	[%i0+4+8], r00		C read low 32 bits of rp[i]
	add	%g2, rlimb, %l5
	fdtox	a016, out016
	fmuld	u32, v016, p016
	srlx	%l5, 32, cy
	ldx	[%sp+2223+16], i00
	faddd	p064, p096b, a064
	fmuld	u32, v032, p032
	add	%g4, cy, cy		C new cy
	ldx	[%sp+2223+24], i16
	faddd	p080, p112b, a080
	fmuld	u32, v048, p048
	std	out000, [%sp+2223+16]
	faddd	p000, a032, a000
	fmuld	u32, v064, p064
	add	i00, r00, rlimb
	add	%i0, 8, %i0		C BOOKKEEPING
	std	out016, [%sp+2223+24]
	sllx	i16, 16, %g2
	add	cy, rlimb, rlimb
	faddd	p016, a048, a016
	fmuld	u32, v080, p080
	srlx	i16, 16, %g4
	add	%g2, rlimb, %l5
	faddd	p032, a064, a032
	fmuld	u32, v096, p096b
	stw	%l5, [%i0+4]
	faddd	p048, a080, a048
	fmuld	u32, v112, p112b
C mid
	and	%g2, xffffffff, %g2
	fdtox	a000, out000
	lduw	[%i0+0], r32		C read high 32 bits of rp[i]
	add	%g2, rlimb, %l5
	fdtox	a016, out016
	srlx	%l5, 32, cy
	ldx	[%sp+2223+0], i00
	faddd	p064, p096a, a064
	add	%g4, cy, cy		C new cy
	ldx	[%sp+2223+8], i16
	faddd	p080, p112a, a080
	std	out000, [%sp+2223+0]
	add	i00, r32, rlimb
	std	out016, [%sp+2223+8]
	sllx	i16, 16, %g2
	add	cy, rlimb, rlimb
	srlx	i16, 16, %g4
	add	%g2, rlimb, %l5
	stw	%l5, [%i0+0]

C WIND-DOWN PHASE 2
C Only conversions remain on the fp side: the pending sums a032/a048 and then
C a064/a080 are converted and stored.  The integer side completes the last
C limb that still has an rp limb to add in.  Also the entry point for n = 1.
.L_wd2:	and	%g2, xffffffff, %g2
	fdtox	a032, out000
	lduw	[%i0+4+8], r00		C read low 32 bits of rp[i]
	add	%g2, rlimb, %l5
	fdtox	a048, out016
	srlx	%l5, 32, cy
	ldx	[%sp+2223+16], i00
	add	%g4, cy, cy		C new cy
	ldx	[%sp+2223+24], i16
	std	out000, [%sp+2223+16]
	add	i00, r00, rlimb
	add	%i0, 8, %i0		C BOOKKEEPING
	std	out016, [%sp+2223+24]
	sllx	i16, 16, %g2
	add	cy, rlimb, rlimb
	srlx	i16, 16, %g4
	add	%g2, rlimb, %l5
	stw	%l5, [%i0+4]
C mid
	and	%g2, xffffffff, %g2
	fdtox	a064, out000
	lduw	[%i0+0], r32		C read high 32 bits of rp[i]
	add	%g2, rlimb, %l5
	fdtox	a080, out016
	srlx	%l5, 32, cy
	ldx	[%sp+2223+0], i00
	add	%g4, cy, cy		C new cy
	ldx	[%sp+2223+8], i16
	std	out000, [%sp+2223+0]
	add	i00, r32, rlimb
	std	out016, [%sp+2223+8]
	sllx	i16, 16, %g2
	add	cy, rlimb, rlimb
	srlx	i16, 16, %g4
	add	%g2, rlimb, %l5
	stw	%l5, [%i0+0]

C WIND-DOWN PHASE 3
C Writes rp[n] and forms the return limb.  rp is not read here: the product
C sum is loaded straight into rlimb, so rp[n] is overwritten rather than
C accumulated.  The last two products (p096b, p112b) are converted and passed
C through the stack for the return value.
.L_wd3:	and	%g2, xffffffff, %g2
	fdtox	p096b, out000
	add	%g2, rlimb, %l5
	fdtox	p112b, out016
	srlx	%l5, 32, cy
	ldx	[%sp+2223+16], rlimb
	add	%g4, cy, cy		C new cy
	ldx	[%sp+2223+24], i16
	std	out000, [%sp+2223+16]
	add	%i0, 8, %i0		C BOOKKEEPING
	std	out016, [%sp+2223+24]
	sllx	i16, 16, %g2
	add	cy, rlimb, rlimb
	srlx	i16, 16, %g4
	add	%g2, rlimb, %l5
	stw	%l5, [%i0+4]
C mid
	and	%g2, xffffffff, %g2
	add	%g2, rlimb, %l5
	srlx	%l5, 32, cy
	ldx	[%sp+2223+0], rlimb
	add	%g4, cy, cy		C new cy
	ldx	[%sp+2223+8], i16
	sllx	i16, 16, %g2
	add	cy, rlimb, rlimb
	srlx	i16, 16, %g4
	add	%g2, rlimb, %l5
	stw	%l5, [%i0+0]

C Return limb = i00 + (i16 << 16) + carry, computed as a full 64-bit value.
	and	%g2, xffffffff, %g2
	add	%g2, rlimb, %l5
	srlx	%l5, 32, cy
	ldx	[%sp+2223+16], i00
	add	%g4, cy, cy		C new cy
	ldx	[%sp+2223+24], i16

	sllx	i16, 16, %g2
	add	i00, cy, cy
C return restores the caller register window before its delay slot runs, so
C the %o0 written there is the caller %o0, i.e. the function result.
	return	%i7+8
	add	%g2, cy, %o0
EPILOGUE(mpn_addmul_2)