beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / sparc64 / ultrasparc1234 / addmul_1.asm
blob48a94146ff618581e03a5fabb33e6403b5e437d5
1 dnl SPARC v9 64-bit mpn_addmul_1 -- Multiply a limb vector with a limb and add
2 dnl the result to a second limb vector.
4 dnl Copyright 1998, 2000-2004 Free Software Foundation, Inc.
6 dnl This file is part of the GNU MP Library.
7 dnl
8 dnl The GNU MP Library is free software; you can redistribute it and/or modify
9 dnl it under the terms of either:
10 dnl
11 dnl * the GNU Lesser General Public License as published by the Free
12 dnl Software Foundation; either version 3 of the License, or (at your
13 dnl option) any later version.
14 dnl
15 dnl or
16 dnl
17 dnl * the GNU General Public License as published by the Free Software
18 dnl Foundation; either version 2 of the License, or (at your option) any
19 dnl later version.
20 dnl
21 dnl or both in parallel, as here.
22 dnl
23 dnl The GNU MP Library is distributed in the hope that it will be useful, but
24 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
25 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
26 dnl for more details.
27 dnl
28 dnl You should have received copies of the GNU General Public License and the
29 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
30 dnl see https://www.gnu.org/licenses/.
32 include(`../config.m4')
34 C cycles/limb
35 C UltraSPARC 1&2: 14
36 C UltraSPARC 3: 17.5
38 C Algorithm: We use eight floating-point multiplies per limb product, with the
39 C invariant v operand split into four 16-bit pieces, and the up operand split
40 C into 32-bit pieces. We sum pairs of 48-bit partial products using
41 C floating-point add, then convert the four 49-bit product-sums and transfer
42 C them to the integer unit.
44 C Possible optimizations:
45 C 0. Rewrite to use algorithm of mpn_addmul_2.
46 C 1. Align the stack area where we transfer the four 49-bit product-sums
47 C to a 32-byte boundary. That would minimize the cache collision.
48 C (UltraSPARC-1/2 use a direct-mapped cache.) (Perhaps even better would
49 C be to align the area to map to the area immediately before up?)
50 C 2. Sum the 4 49-bit quantities using 32-bit operations, as in the
51 C develop mpn_addmul_2. This would save many integer instructions.
52 C 3. Unrolling. Questionable if it is worth the code expansion, given that
53 C it could only save 1 cycle/limb.
54 C 4. Specialize for particular v values. If its upper 32 bits are zero, we
55 C could save many operations, in the FPU (fmuld), but more so in the IEU
56 C since we'll be summing 48-bit quantities, which might be simpler.
57 C 5. Ideally, we should schedule the f2/f3 and f4/f5 RAW further apart, and
58 C the i00,i16,i32,i48 RAW closer together. The latter distance should be
59 C no greater than needed for L2 cache latency, and also not so great
60 C that i16 needs to be copied.
61 C 6. Avoid performing mem+fa+fm in the same cycle, at least not when we want
62 C to get high IEU bandwidth. (12 of the 14 cycles will be free for 2 IEU
63 C ops.)
65 C Instruction classification (as per UltraSPARC-1/2 functional units):
66 C 8 FM
67 C 10 FA
68 C 12 MEM
69 C 10 ISHIFT + 14 IADDLOG
70 C 1 BRANCH
71 C 55 insns in total (plus one mov insn that should be optimized out)
73 C The loop executes 56 instructions in 14 cycles on UltraSPARC-1/2, i.e., we
74 C sustain the peak execution rate of 4 instructions/cycle.
76 C INPUT PARAMETERS
77 C rp i0
78 C up i1
79 C n i2
80 C v i3
C Assembler setup. ASM_START and REGISTER are m4 macros that are not defined
C in this file (presumably they come from the included config.m4 -- confirm).
C REGISTER is applied to %g2 and %g3, both of which the routine below uses as
C integer scratch registers.
82 ASM_START()
83 REGISTER(%g2,#scratch)
84 REGISTER(%g3,#scratch)
C Symbolic register names used by mpn_addmul_1.
C Floating-point registers:
C   p00, p16, p32, p48   products of u00 with v00, v16, v32, v48
C   r32, r48, r64, r80   products of u32 with v00, v16, v32, v48
C   v00, v16, v32, v48   the four 16-bit pieces of v, converted to doubles
C   u00, u32             the two 32-bit halves of an up limb, as doubles
C   a00, a16, a32, a48   products or sums of products, converted to integers
C                        with fdtox before being stored to the stack
86 define(`p00', `%f8') define(`p16',`%f10') define(`p32',`%f12') define(`p48',`%f14')
87 define(`r32',`%f16') define(`r48',`%f18') define(`r64',`%f20') define(`r80',`%f22')
88 define(`v00',`%f24') define(`v16',`%f26') define(`v32',`%f28') define(`v48',`%f30')
89 define(`u00',`%f32') define(`u32', `%f34')
90 define(`a00',`%f36') define(`a16',`%f38') define(`a32',`%f40') define(`a48',`%f42')
C Integer registers:
C   cy                   running carry between result limbs, returned at the end
C   rlimb                limb loaded from rp
C   i00, i16, i32, i48   values reloaded from the stack scratch slots that the
C                        a00..a48 registers were stored to
C   xffffffff, xffff     32-bit and 16-bit all-ones masks
91 define(`cy',`%g1')
92 define(`rlimb',`%g3')
93 define(`i00',`%l0') define(`i16',`%l1') define(`i32',`%l2') define(`i48',`%l3')
94 define(`xffffffff',`%l7')
95 define(`xffff',`%o0')
C mpn_addmul_1: multiply the limb vector at up (n limbs) by the limb v and add
C the products into the limb vector at rp.
C   In:   %i0 = rp, %i1 = up, %i2 = n, %i3 = v (register names after the save)
C   Out:  the final value of cy is copied to %o0 in the delay slot of the
C         return instruction at the end of the routine
C   Uses: %g1-%g5, %l0-%l7, %o0-%o5, %o7, %i0-%i5, %f2-%f5 and the double
C         registers %f8 through %f42, plus four 8-byte scratch slots at
C         [%sp+2223+0] .. [%sp+2223+24] that carry values between the integer
C         and floating-point register files
C NOTE(review): 2223 is presumably the 2047-byte SPARC V9 stack bias plus a
C 176-byte register save and argument area -- confirm against the ABI.
97 PROLOGUE(mpn_addmul_1)
99 C Initialization. (1) Split v operand into four 16-bit chunks and store them
100 C as IEEE double in fp registers. (2) Clear upper 32 bits of fp register pairs
101 C f2 and f4. (3) Store masks in registers aliased to `xffff' and `xffffffff'.
C Open a new register window with a 256-byte frame, then build the masks from
C an all-ones value in %g4 and write the four 16-bit pieces of v, each as a
C 64-bit integer, to the four scratch slots.
103 save %sp, -256, %sp
104 mov -1, %g4
105 srlx %g4, 48, xffff C store mask in register `xffff'
106 and %i3, xffff, %g2
107 stx %g2, [%sp+2223+0]
108 srlx %i3, 16, %g3
109 and %g3, xffff, %g3
110 stx %g3, [%sp+2223+8]
111 srlx %i3, 32, %g2
112 and %g2, xffff, %g2
113 stx %g2, [%sp+2223+16]
114 srlx %i3, 48, %g3
115 stx %g3, [%sp+2223+24]
116 srlx %g4, 32, xffffffff C store mask in register `xffffffff'
C Turn n into a negative byte index: %i2 becomes -8*n while %i0 and %i1 are
C advanced by 8*n, so that [base+%i2] walks forward as %i2 counts up to zero.
C   %i5 = %i1 + 4    addresses the second 32-bit word of each up limb
C   %i4 = %i0 - 32   base for result stores
C   %i0 = %i0 - 16   base for rp loads
C The -32 and -16 biases compensate for how far %i2 has already been advanced
C when the store, respectively the rp load, for a given limb is issued.
118 sllx %i2, 3, %i2
119 mov 0, cy C clear cy
120 add %i0, %i2, %i0
121 add %i1, %i2, %i1
122 neg %i2
123 add %i1, 4, %i5
124 add %i0, -32, %i4
125 add %i0, -16, %i0
C Reload the pieces of v into fp registers and convert them to doubles. The
C two single-word loads from slot 0 fetch the first word of the stored 16-bit
C value; on big-endian SPARC that is the upper, all-zero word, so f2 and f4
C become zero and the pairs f2/f3 and f4/f5 hold zero-extended 32-bit halves
C of the first up limb once f3 and f5 are loaded.
127 ldd [%sp+2223+0], v00
128 ldd [%sp+2223+8], v16
129 ldd [%sp+2223+16], v32
130 ldd [%sp+2223+24], v48
131 ld [%sp+2223+0],%f2 C zero f2
132 ld [%sp+2223+0],%f4 C zero f4
133 ld [%i5+%i2], %f3 C read low 32 bits of up[i]
134 ld [%i1+%i2], %f5 C read high 32 bits of up[i]
135 fxtod v00, v00
136 fxtod v16, v16
137 fxtod v32, v32
138 fxtod v48, v48
140 C Start real work. (We sneakingly read f3 and f5 above...)
141 C The software pipeline is very deep, requiring 4 feed-in stages.
C Feed-in stage 1: first products of the first limb. These go straight into
C a00 and a16 because there is no earlier limb to sum with. The index is then
C advanced; a zero result means n = 1. The fmuld following each conditional
C branch sits in the delay slot and executes on both paths.
143 fxtod %f2, u00
144 fxtod %f4, u32
145 fmuld u00, v00, a00
146 fmuld u00, v16, a16
147 fmuld u00, v32, p32
148 fmuld u32, v00, r32
149 fmuld u00, v48, p48
150 addcc %i2, 8, %i2
151 bnz,pt %xcc, .L_two_or_more
152 fmuld u32, v16, r48
C n = 1: finish the products of the only limb, pass them through the scratch
C slots into i00..i48, then pass the converted r64 and r80 through slots 0 and
C 8 as well, and join the wind-down at .L_out_1.
154 .L_one:
155 fmuld u32, v32, r64 C FIXME not urgent
156 faddd p32, r32, a32
157 fdtox a00, a00
158 faddd p48, r48, a48
159 fmuld u32, v48, r80 C FIXME not urgent
160 fdtox a16, a16
161 fdtox a32, a32
162 fdtox a48, a48
163 std a00, [%sp+2223+0]
164 std a16, [%sp+2223+8]
165 std a32, [%sp+2223+16]
166 std a48, [%sp+2223+24]
167 add %i2, 8, %i2
169 fdtox r64, a00
170 ldx [%i0+%i2], rlimb C read rp[i]
171 fdtox r80, a16
172 ldx [%sp+2223+0], i00
173 ldx [%sp+2223+8], i16
174 ldx [%sp+2223+16], i32
175 ldx [%sp+2223+24], i48
176 std a00, [%sp+2223+0]
177 std a16, [%sp+2223+8]
178 add %i2, 8, %i2
180 srlx rlimb, 32, %g4 C HI(rlimb)
181 and rlimb, xffffffff, %g5 C LO(rlimb)
182 add i00, %g5, %g5 C i00+ now in g5
183 ldx [%sp+2223+0], i00
184 srlx i16, 48, %l4 C (i16 >> 48)
185 mov i16, %g2
186 ldx [%sp+2223+8], i16
187 srlx i48, 16, %l5 C (i48 >> 16)
188 add i32, %g4, %g4 C i32+ now in g4
189 sllx i48, 32, %l6 C (i48 << 32)
190 srlx %g4, 32, %o3 C (i32 >> 32)
191 add %l5, %l4, %o1 C hi64- in %o1
192 std a00, [%sp+2223+0]
193 sllx %g4, 16, %o2 C (i32 << 16)
194 add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT
195 std a16, [%sp+2223+8]
196 sllx %o1, 48, %o3 C (hi64 << 48)
197 add %g2, %o2, %o2 C mi64- in %o2
198 add %l6, %o2, %o2 C mi64- in %o2
199 sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT
200 add cy, %g5, %o4 C x = prev(i00) + cy
201 b .L_out_1
202 add %i2, 8, %i2
C Feed-in stage 2, n >= 2: load the next up limb, finish and store the sums of
C the first limb, and start the products of the new limb. From here on a00 and
C a16 are sums of a low-half product of the new limb and a high-half product
C (r64, r80) of the limb before it.
204 .L_two_or_more:
205 ld [%i5+%i2], %f3 C read low 32 bits of up[i]
206 fmuld u32, v32, r64 C FIXME not urgent
207 faddd p32, r32, a32
208 ld [%i1+%i2], %f5 C read high 32 bits of up[i]
209 fdtox a00, a00
210 faddd p48, r48, a48
211 fmuld u32, v48, r80 C FIXME not urgent
212 fdtox a16, a16
213 fdtox a32, a32
214 fxtod %f2, u00
215 fxtod %f4, u32
216 fdtox a48, a48
217 std a00, [%sp+2223+0]
218 fmuld u00, v00, p00
219 std a16, [%sp+2223+8]
220 fmuld u00, v16, p16
221 std a32, [%sp+2223+16]
222 fmuld u00, v32, p32
223 std a48, [%sp+2223+24]
224 faddd p00, r64, a00
225 fmuld u32, v00, r32
226 faddd p16, r80, a16
227 fmuld u00, v48, p48
228 addcc %i2, 8, %i2
229 bnz,pt %xcc, .L_three_or_more
230 fmuld u32, v16, r48
C n = 2: no further up limbs. Finish the second limb, begin the integer work
C for the first limb, and join the wind-down at .L_out_2.
232 .L_two:
233 fmuld u32, v32, r64 C FIXME not urgent
234 faddd p32, r32, a32
235 fdtox a00, a00
236 ldx [%i0+%i2], rlimb C read rp[i]
237 faddd p48, r48, a48
238 fmuld u32, v48, r80 C FIXME not urgent
239 fdtox a16, a16
240 ldx [%sp+2223+0], i00
241 fdtox a32, a32
242 ldx [%sp+2223+8], i16
243 ldx [%sp+2223+16], i32
244 ldx [%sp+2223+24], i48
245 fdtox a48, a48
246 std a00, [%sp+2223+0]
247 std a16, [%sp+2223+8]
248 std a32, [%sp+2223+16]
249 std a48, [%sp+2223+24]
250 add %i2, 8, %i2
252 fdtox r64, a00
253 srlx rlimb, 32, %g4 C HI(rlimb)
254 and rlimb, xffffffff, %g5 C LO(rlimb)
255 ldx [%i0+%i2], rlimb C read rp[i]
256 add i00, %g5, %g5 C i00+ now in g5
257 fdtox r80, a16
258 ldx [%sp+2223+0], i00
259 srlx i16, 48, %l4 C (i16 >> 48)
260 mov i16, %g2
261 ldx [%sp+2223+8], i16
262 srlx i48, 16, %l5 C (i48 >> 16)
263 add i32, %g4, %g4 C i32+ now in g4
264 ldx [%sp+2223+16], i32
265 sllx i48, 32, %l6 C (i48 << 32)
266 ldx [%sp+2223+24], i48
267 srlx %g4, 32, %o3 C (i32 >> 32)
268 add %l5, %l4, %o1 C hi64- in %o1
269 std a00, [%sp+2223+0]
270 sllx %g4, 16, %o2 C (i32 << 16)
271 add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT
272 std a16, [%sp+2223+8]
273 sllx %o1, 48, %o3 C (hi64 << 48)
274 add %g2, %o2, %o2 C mi64- in %o2
275 add %l6, %o2, %o2 C mi64- in %o2
276 sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT
277 add cy, %g5, %o4 C x = prev(i00) + cy
278 b .L_out_2
279 add %i2, 8, %i2
C Feed-in stage 3, n >= 3: as stage 2, and additionally read the first rp limb
C and reload the stored sums of the first limb into i00..i48 before the slots
C are overwritten with the sums of the second limb.
281 .L_three_or_more:
282 ld [%i5+%i2], %f3 C read low 32 bits of up[i]
283 fmuld u32, v32, r64 C FIXME not urgent
284 faddd p32, r32, a32
285 ld [%i1+%i2], %f5 C read high 32 bits of up[i]
286 fdtox a00, a00
287 ldx [%i0+%i2], rlimb C read rp[i]
288 faddd p48, r48, a48
289 fmuld u32, v48, r80 C FIXME not urgent
290 fdtox a16, a16
291 ldx [%sp+2223+0], i00
292 fdtox a32, a32
293 ldx [%sp+2223+8], i16
294 fxtod %f2, u00
295 ldx [%sp+2223+16], i32
296 fxtod %f4, u32
297 ldx [%sp+2223+24], i48
298 fdtox a48, a48
299 std a00, [%sp+2223+0]
300 fmuld u00, v00, p00
301 std a16, [%sp+2223+8]
302 fmuld u00, v16, p16
303 std a32, [%sp+2223+16]
304 fmuld u00, v32, p32
305 std a48, [%sp+2223+24]
306 faddd p00, r64, a00
307 fmuld u32, v00, r32
308 faddd p16, r80, a16
309 fmuld u00, v48, p48
310 addcc %i2, 8, %i2
311 bnz,pt %xcc, .L_four_or_more
312 fmuld u32, v16, r48
C n = 3: no further up limbs. Finish the third limb, begin the integer work
C for the first limb, and join the wind-down at .L_out_3.
314 .L_three:
315 fmuld u32, v32, r64 C FIXME not urgent
316 faddd p32, r32, a32
317 fdtox a00, a00
318 srlx rlimb, 32, %g4 C HI(rlimb)
319 and rlimb, xffffffff, %g5 C LO(rlimb)
320 ldx [%i0+%i2], rlimb C read rp[i]
321 faddd p48, r48, a48
322 add i00, %g5, %g5 C i00+ now in g5
323 fmuld u32, v48, r80 C FIXME not urgent
324 fdtox a16, a16
325 ldx [%sp+2223+0], i00
326 fdtox a32, a32
327 srlx i16, 48, %l4 C (i16 >> 48)
328 mov i16, %g2
329 ldx [%sp+2223+8], i16
330 srlx i48, 16, %l5 C (i48 >> 16)
331 add i32, %g4, %g4 C i32+ now in g4
332 ldx [%sp+2223+16], i32
333 sllx i48, 32, %l6 C (i48 << 32)
334 ldx [%sp+2223+24], i48
335 fdtox a48, a48
336 srlx %g4, 32, %o3 C (i32 >> 32)
337 add %l5, %l4, %o1 C hi64- in %o1
338 std a00, [%sp+2223+0]
339 sllx %g4, 16, %o2 C (i32 << 16)
340 add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT
341 std a16, [%sp+2223+8]
342 sllx %o1, 48, %o3 C (hi64 << 48)
343 add %g2, %o2, %o2 C mi64- in %o2
344 std a32, [%sp+2223+16]
345 add %l6, %o2, %o2 C mi64- in %o2
346 std a48, [%sp+2223+24]
347 sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT
348 add cy, %g5, %o4 C x = prev(i00) + cy
349 b .L_out_3
350 add %i2, 8, %i2
C Feed-in stage 4, n >= 4: the same instruction stream as one pass of the main
C loop, except for the instructions that finish and store a result limb, since
C no limb is that far along yet. Leaves hi64 in %o1, mi64 in %o2 and x in %o4
C for the first loop pass or for .L_out_4.
352 .L_four_or_more:
353 ld [%i5+%i2], %f3 C read low 32 bits of up[i]
354 fmuld u32, v32, r64 C FIXME not urgent
355 faddd p32, r32, a32
356 ld [%i1+%i2], %f5 C read high 32 bits of up[i]
357 fdtox a00, a00
358 srlx rlimb, 32, %g4 C HI(rlimb)
359 and rlimb, xffffffff, %g5 C LO(rlimb)
360 ldx [%i0+%i2], rlimb C read rp[i]
361 faddd p48, r48, a48
362 add i00, %g5, %g5 C i00+ now in g5
363 fmuld u32, v48, r80 C FIXME not urgent
364 fdtox a16, a16
365 ldx [%sp+2223+0], i00
366 fdtox a32, a32
367 srlx i16, 48, %l4 C (i16 >> 48)
368 mov i16, %g2
369 ldx [%sp+2223+8], i16
370 fxtod %f2, u00
371 srlx i48, 16, %l5 C (i48 >> 16)
372 add i32, %g4, %g4 C i32+ now in g4
373 ldx [%sp+2223+16], i32
374 fxtod %f4, u32
375 sllx i48, 32, %l6 C (i48 << 32)
376 ldx [%sp+2223+24], i48
377 fdtox a48, a48
378 srlx %g4, 32, %o3 C (i32 >> 32)
379 add %l5, %l4, %o1 C hi64- in %o1
380 std a00, [%sp+2223+0]
381 fmuld u00, v00, p00
382 sllx %g4, 16, %o2 C (i32 << 16)
383 add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT
384 std a16, [%sp+2223+8]
385 fmuld u00, v16, p16
386 sllx %o1, 48, %o3 C (hi64 << 48)
387 add %g2, %o2, %o2 C mi64- in %o2
388 std a32, [%sp+2223+16]
389 fmuld u00, v32, p32
390 add %l6, %o2, %o2 C mi64- in %o2
391 std a48, [%sp+2223+24]
392 faddd p00, r64, a00
393 fmuld u32, v00, r32
394 sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT
395 faddd p16, r80, a16
396 fmuld u00, v48, p48
397 add cy, %g5, %o4 C x = prev(i00) + cy
398 addcc %i2, 8, %i2
399 bnz,pt %xcc, .Loop
400 fmuld u32, v16, r48
C n = 4: skip the main loop. The branch is annulled, so no delay-slot
C instruction is executed.
402 .L_four:
403 b,a .L_out_4
C Steady state. Each pass overlaps work on several limbs: it loads and
C converts a new up limb and starts its products, finishes the products and
C sums started on the previous pass, moves converted sums through the scratch
C slots into i00..i48 while reading a limb of rp, and assembles and stores one
C finished result limb through %i4. The `C nn' markers number the 14 cycles
C of the intended schedule, four instructions per cycle.
C Integer recombination as computed by the instructions below:
C   g5   = i00 + LO(rlimb)        g4 = i32 + HI(rlimb)
C   hi64 = (i48 >> 16) + (i16 >> 48) + (g4 >> 32)
C   mi64 = i16 + (g4 << 16) + (i48 << 32) - (hi64 << 48)
C   x    = g5 + cy
C and then, on the following pass:
C   mi64   = mi64 + (x >> 16)
C   result = (mi64 << 16) | (x & 0xffff)
C   cy     = (mi64 >> 48) + hi64
405 C BEGIN MAIN LOOP
406 .align 16
407 .Loop:
408 C 00
409 srlx %o4, 16, %o5 C (x >> 16)
410 ld [%i5+%i2], %f3 C read low 32 bits of up[i]
411 fmuld u32, v32, r64 C FIXME not urgent
412 faddd p32, r32, a32
413 C 01
414 add %o5, %o2, %o2 C mi64 in %o2 2nd ASSIGNMENT
415 and %o4, xffff, %o5 C (x & 0xffff)
416 ld [%i1+%i2], %f5 C read high 32 bits of up[i]
417 fdtox a00, a00
418 C 02
419 srlx rlimb, 32, %g4 C HI(rlimb)
420 and rlimb, xffffffff, %g5 C LO(rlimb)
421 ldx [%i0+%i2], rlimb C read rp[i]
422 faddd p48, r48, a48
423 C 03
424 srlx %o2, 48, %o7 C (mi64 >> 48)
425 add i00, %g5, %g5 C i00+ now in g5
426 fmuld u32, v48, r80 C FIXME not urgent
427 fdtox a16, a16
428 C 04
429 sllx %o2, 16, %i3 C (mi64 << 16)
430 add %o7, %o1, cy C new cy
431 ldx [%sp+2223+0], i00
432 fdtox a32, a32
433 C 05
434 srlx i16, 48, %l4 C (i16 >> 48)
435 mov i16, %g2
436 ldx [%sp+2223+8], i16
437 fxtod %f2, u00
438 C 06
439 srlx i48, 16, %l5 C (i48 >> 16)
440 add i32, %g4, %g4 C i32+ now in g4
441 ldx [%sp+2223+16], i32
442 fxtod %f4, u32
443 C 07
444 sllx i48, 32, %l6 C (i48 << 32)
445 or %i3, %o5, %o5
446 ldx [%sp+2223+24], i48
447 fdtox a48, a48
448 C 08
449 srlx %g4, 32, %o3 C (i32 >> 32)
450 add %l5, %l4, %o1 C hi64- in %o1
451 std a00, [%sp+2223+0]
452 fmuld u00, v00, p00
453 C 09
454 sllx %g4, 16, %o2 C (i32 << 16)
455 add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT
456 std a16, [%sp+2223+8]
457 fmuld u00, v16, p16
458 C 10
459 sllx %o1, 48, %o3 C (hi64 << 48)
460 add %g2, %o2, %o2 C mi64- in %o2
461 std a32, [%sp+2223+16]
462 fmuld u00, v32, p32
463 C 11
464 add %l6, %o2, %o2 C mi64- in %o2
465 std a48, [%sp+2223+24]
466 faddd p00, r64, a00
467 fmuld u32, v00, r32
468 C 12
469 sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT
470 stx %o5, [%i4+%i2]
471 faddd p16, r80, a16
472 fmuld u00, v48, p48
473 C 13
474 add cy, %g5, %o4 C x = prev(i00) + cy
475 addcc %i2, 8, %i2
476 bnz,pt %xcc, .Loop
477 fmuld u32, v16, r48
478 C END MAIN LOOP
C Wind-down. The four stages below drain the pipeline; each one finishes and
C stores one result limb and omits the work for up limbs that do not exist.
C Execution falls through from one stage to the next. .L_out_4 is reached from
C the loop exit and from .L_four; the later stages are also entered directly
C from .L_three, .L_two and .L_one.
C .L_out_4: no new up limb is loaded; the last limb is finished and its four
C sums are converted and stored.
480 .L_out_4:
481 srlx %o4, 16, %o5 C (x >> 16)
482 fmuld u32, v32, r64 C FIXME not urgent
483 faddd p32, r32, a32
484 add %o5, %o2, %o2 C mi64 in %o2 2nd ASSIGNMENT
485 and %o4, xffff, %o5 C (x & 0xffff)
486 fdtox a00, a00
487 srlx rlimb, 32, %g4 C HI(rlimb)
488 and rlimb, xffffffff, %g5 C LO(rlimb)
489 ldx [%i0+%i2], rlimb C read rp[i]
490 faddd p48, r48, a48
491 srlx %o2, 48, %o7 C (mi64 >> 48)
492 add i00, %g5, %g5 C i00+ now in g5
493 fmuld u32, v48, r80 C FIXME not urgent
494 fdtox a16, a16
495 sllx %o2, 16, %i3 C (mi64 << 16)
496 add %o7, %o1, cy C new cy
497 ldx [%sp+2223+0], i00
498 fdtox a32, a32
499 srlx i16, 48, %l4 C (i16 >> 48)
500 mov i16, %g2
501 ldx [%sp+2223+8], i16
502 srlx i48, 16, %l5 C (i48 >> 16)
503 add i32, %g4, %g4 C i32+ now in g4
504 ldx [%sp+2223+16], i32
505 sllx i48, 32, %l6 C (i48 << 32)
506 or %i3, %o5, %o5
507 ldx [%sp+2223+24], i48
508 fdtox a48, a48
509 srlx %g4, 32, %o3 C (i32 >> 32)
510 add %l5, %l4, %o1 C hi64- in %o1
511 std a00, [%sp+2223+0]
512 sllx %g4, 16, %o2 C (i32 << 16)
513 add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT
514 std a16, [%sp+2223+8]
515 sllx %o1, 48, %o3 C (hi64 << 48)
516 add %g2, %o2, %o2 C mi64- in %o2
517 std a32, [%sp+2223+16]
518 add %l6, %o2, %o2 C mi64- in %o2
519 std a48, [%sp+2223+24]
520 sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT
521 stx %o5, [%i4+%i2]
522 add cy, %g5, %o4 C x = prev(i00) + cy
523 add %i2, 8, %i2
C .L_out_3: the only fp work left is converting r64 and r80, the two high-half
C products of the last limb, which are stored to slots 0 and 8. The last rp
C limb is read here.
524 .L_out_3:
525 srlx %o4, 16, %o5 C (x >> 16)
526 add %o5, %o2, %o2 C mi64 in %o2 2nd ASSIGNMENT
527 and %o4, xffff, %o5 C (x & 0xffff)
528 fdtox r64, a00
529 srlx rlimb, 32, %g4 C HI(rlimb)
530 and rlimb, xffffffff, %g5 C LO(rlimb)
531 ldx [%i0+%i2], rlimb C read rp[i]
532 srlx %o2, 48, %o7 C (mi64 >> 48)
533 add i00, %g5, %g5 C i00+ now in g5
534 fdtox r80, a16
535 sllx %o2, 16, %i3 C (mi64 << 16)
536 add %o7, %o1, cy C new cy
537 ldx [%sp+2223+0], i00
538 srlx i16, 48, %l4 C (i16 >> 48)
539 mov i16, %g2
540 ldx [%sp+2223+8], i16
541 srlx i48, 16, %l5 C (i48 >> 16)
542 add i32, %g4, %g4 C i32+ now in g4
543 ldx [%sp+2223+16], i32
544 sllx i48, 32, %l6 C (i48 << 32)
545 or %i3, %o5, %o5
546 ldx [%sp+2223+24], i48
547 srlx %g4, 32, %o3 C (i32 >> 32)
548 add %l5, %l4, %o1 C hi64- in %o1
549 std a00, [%sp+2223+0]
550 sllx %g4, 16, %o2 C (i32 << 16)
551 add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT
552 std a16, [%sp+2223+8]
553 sllx %o1, 48, %o3 C (hi64 << 48)
554 add %g2, %o2, %o2 C mi64- in %o2
555 add %l6, %o2, %o2 C mi64- in %o2
556 sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT
557 stx %o5, [%i4+%i2]
558 add cy, %g5, %o4 C x = prev(i00) + cy
559 add %i2, 8, %i2
C .L_out_2: integer work only. Slots 0 and 8 are reloaded into i00 and i16;
C these two values are consumed by the final carry computation at the end.
560 .L_out_2:
561 srlx %o4, 16, %o5 C (x >> 16)
562 add %o5, %o2, %o2 C mi64 in %o2 2nd ASSIGNMENT
563 and %o4, xffff, %o5 C (x & 0xffff)
564 srlx rlimb, 32, %g4 C HI(rlimb)
565 and rlimb, xffffffff, %g5 C LO(rlimb)
566 srlx %o2, 48, %o7 C (mi64 >> 48)
567 add i00, %g5, %g5 C i00+ now in g5
568 sllx %o2, 16, %i3 C (mi64 << 16)
569 add %o7, %o1, cy C new cy
570 ldx [%sp+2223+0], i00
571 srlx i16, 48, %l4 C (i16 >> 48)
572 mov i16, %g2
573 ldx [%sp+2223+8], i16
574 srlx i48, 16, %l5 C (i48 >> 16)
575 add i32, %g4, %g4 C i32+ now in g4
576 sllx i48, 32, %l6 C (i48 << 32)
577 or %i3, %o5, %o5
578 srlx %g4, 32, %o3 C (i32 >> 32)
579 add %l5, %l4, %o1 C hi64- in %o1
580 sllx %g4, 16, %o2 C (i32 << 16)
581 add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT
582 sllx %o1, 48, %o3 C (hi64 << 48)
583 add %g2, %o2, %o2 C mi64- in %o2
584 add %l6, %o2, %o2 C mi64- in %o2
585 sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT
586 stx %o5, [%i4+%i2]
587 add cy, %g5, %o4 C x = prev(i00) + cy
588 add %i2, 8, %i2
C .L_out_1: assemble and store the last result limb from x, mi64 and hi64.
589 .L_out_1:
590 srlx %o4, 16, %o5 C (x >> 16)
591 add %o5, %o2, %o2 C mi64 in %o2 2nd ASSIGNMENT
592 and %o4, xffff, %o5 C (x & 0xffff)
593 srlx %o2, 48, %o7 C (mi64 >> 48)
594 sllx %o2, 16, %i3 C (mi64 << 16)
595 add %o7, %o1, cy C new cy
596 or %i3, %o5, %o5
597 stx %o5, [%i4+%i2]
C Final carry: cy = cy + i00 + (i16 << 16), where i00 and i16 hold the values
C converted from r64 and r80 for the last limb. The shift by 0 is only a
C copy into %g2.
599 sllx i00, 0, %g2
600 add %g2, cy, cy
601 sllx i16, 16, %g3
602 add %g3, cy, cy
C Return to the caller; the mov in the delay slot places cy in %o0 as the
C function result.
604 return %i7+8
605 mov cy, %o0
606 EPILOGUE(mpn_addmul_1)