beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / sparc64 / ultrasparc1234 / mul_1.asm
blob871d562fcb2209cc6c19ec4c8c5416b94daaa0f7
1 dnl SPARC v9 64-bit mpn_mul_1 -- Multiply a limb vector with a limb and store
2 dnl the result in a second limb vector.
4 dnl Copyright 1998, 2000-2003 Free Software Foundation, Inc.
6 dnl This file is part of the GNU MP Library.
7 dnl
8 dnl The GNU MP Library is free software; you can redistribute it and/or modify
9 dnl it under the terms of either:
10 dnl
11 dnl * the GNU Lesser General Public License as published by the Free
12 dnl Software Foundation; either version 3 of the License, or (at your
13 dnl option) any later version.
14 dnl
15 dnl or
16 dnl
17 dnl * the GNU General Public License as published by the Free Software
18 dnl Foundation; either version 2 of the License, or (at your option) any
19 dnl later version.
20 dnl
21 dnl or both in parallel, as here.
22 dnl
23 dnl The GNU MP Library is distributed in the hope that it will be useful, but
24 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
25 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
26 dnl for more details.
27 dnl
28 dnl You should have received copies of the GNU General Public License and the
29 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
30 dnl see https://www.gnu.org/licenses/.
32 include(`../config.m4')
34 C cycles/limb
35 C UltraSPARC 1&2: 14
36 C UltraSPARC 3: 18.5
38 C Algorithm: We use eight floating-point multiplies per limb product, with the
39 C invariant v operand split into four 16-bit pieces, and the s1 operand split
40 C into 32-bit pieces. We sum pairs of 48-bit partial products using
41 C floating-point add, then convert the four 49-bit product-sums and transfer
42 C them to the integer unit.
44 C Possible optimizations:
45 C 1. Align the stack area where we transfer the four 49-bit product-sums
46 C to a 32-byte boundary. That would minimize the cache collision.
47 C (UltraSPARC-1/2 use a direct-mapped cache.) (Perhaps even better would
48 C be to align the area to map to the area immediately before s1?)
49 C 2. Sum the 4 49-bit quantities using 32-bit operations, as in the
50 C develop mpn_addmul_2. This would save many integer instructions.
51 C 3. Unrolling. Questionable if it is worth the code expansion, given that
52 C it could only save 1 cycle/limb.
53 C 4. Specialize for particular v values. If its upper 32 bits are zero, we
54 C could save many operations, in the FPU (fmuld), but more so in the IEU
55 C since we'll be summing 48-bit quantities, which might be simpler.
56 C 5. Ideally, we should schedule the f2/f3 and f4/f5 RAW further apart, and
57 C the i00,i16,i32,i48 RAW less apart. The latter apart-scheduling should
58 C not be greater than needed for L2 cache latency, and also not so great
59 C that i16 needs to be copied.
60 C 6. Avoid performing mem+fa+fm in the same cycle, at least not when we want
61 C to get high IEU bandwidth. (12 of the 14 cycles will be free for 2 IEU
62 C ops.)
64 C Instruction classification (as per UltraSPARC-1/2 functional units):
65 C 8 FM
66 C 10 FA
67 C 11 MEM
68 C 9 ISHIFT + 11 IADDLOG (the addcc included)
69 C 1 BRANCH
70 C 50 insns totally (plus three mov insns that should be optimized out)
72 C The loop executes 53 instructions in 14 cycles on UltraSPARC-1/2, i.e., we
73 C sustain 3.79 instructions/cycle.
75 C INPUT PARAMETERS
76 C rp i0
77 C up i1
78 C n i2
79 C v i3
81 ASM_START()
C Scratch-register declarations and the symbolic register names used by
C mpn_mul_1 below.
82 REGISTER(%g2,#scratch)
83 REGISTER(%g3,#scratch)
C Partial products of the current limb.  In the steady state p00/p16/p32/p48
C receive u00 times v00/v16/v32/v48 and r32/r48/r64/r80 receive u32 times
C v00/v16/v32/v48.
85 define(`p00', `%f8') define(`p16',`%f10') define(`p32',`%f12') define(`p48',`%f14')
86 define(`r32',`%f16') define(`r48',`%f18') define(`r64',`%f20') define(`r80',`%f22')
C v00..v48: the four 16-bit pieces of v, converted to double once at entry.
87 define(`v00',`%f24') define(`v16',`%f26') define(`v32',`%f28') define(`v48',`%f30')
C u00/u32: low and high 32 bits of the current up limb, converted to double.
88 define(`u00',`%f32') define(`u32', `%f34')
C a00..a48: product sums that get converted with fdtox and stored to the stack.
89 define(`a00',`%f36') define(`a16',`%f38') define(`a32',`%f40') define(`a48',`%f42')
C cy: carry passed from limb to limb; its final value is the return value.
90 define(`cy',`%g1')
C NOTE(review): rlimb is defined but not referenced by name in the visible
C code of mpn_mul_1.
91 define(`rlimb',`%g3')
C i00..i48: integer copies of a00..a48, loaded back from the stack with ldx.
92 define(`i00',`%l0') define(`i16',`%l1') define(`i32',`%l2') define(`i48',`%l3')
C Constant masks, filled in during initialization.
93 define(`xffffffff',`%l7')
94 define(`xffff',`%o0')
96 PROLOGUE(mpn_mul_1)
C mpn_mul_1 entry.  Per the INPUT PARAMETERS list: rp in %i0, up in %i1, n in
C %i2, v in %i3.  One result limb is stored through %i4 for every limb read
C from up, and the final value of cy is returned in %o0.
C All stack traffic uses the four 8-byte slots at %sp+2223+0 .. +24.
C NOTE(review): 2223 is presumably the V9 stack bias 2047 plus 176 -- confirm.
98 C Initialization. (1) Split v operand into four 16-bit chunks and store them
99 C as IEEE double in fp registers. (2) Clear upper 32 bits of fp register pairs
100 C f2 and f4. (3) Store masks in registers aliased to `xffff' and `xffffffff'.
102 save %sp, -256, %sp C new register window, %sp lowered by 256
103 mov -1, %g4 C all ones, source of both masks
104 srlx %g4, 48, xffff C store mask in register `xffff'
105 and %i3, xffff, %g2 C bits 0..15 of v
106 stx %g2, [%sp+2223+0]
107 srlx %i3, 16, %g3
108 and %g3, xffff, %g3 C bits 16..31 of v
109 stx %g3, [%sp+2223+8]
110 srlx %i3, 32, %g2
111 and %g2, xffff, %g2 C bits 32..47 of v
112 stx %g2, [%sp+2223+16]
113 srlx %i3, 48, %g3 C bits 48..63 of v, the shift alone isolates them
114 stx %g3, [%sp+2223+24]
115 srlx %g4, 32, xffffffff C store mask in register `xffffffff'
C NOTE(review): xffffffff is not read anywhere in the visible code of this
C routine.
117 sllx %i2, 3, %i2 C limb count to byte count
118 mov 0, cy C clear cy
119 add %i0, %i2, %i0 C %i0 = rp + 8n
120 add %i1, %i2, %i1 C %i1 = up + 8n
121 neg %i2 C %i2 = -8n, a negative byte index that counts up to zero
122 add %i1, 4, %i5 C %i5 = %i1 + 4, used to fetch the low word of each limb
123 add %i0, -32, %i4 C store base: stores to [%i4+%i2] run four limbs behind the loads
124 add %i0, -16, %i0 C NOTE(review): %i0 is not read again in this routine
126 ldd [%sp+2223+0], v00 C bring the four v chunks into fp registers as raw integers
127 ldd [%sp+2223+8], v16
128 ldd [%sp+2223+16], v32
129 ldd [%sp+2223+24], v48
C The word loaded into f2 and f4 is the upper half of the slot that holds
C v & 0xffff, so it is zero.  Only f3 and f5 are reloaded per limb, which
C leaves the pairs f2/f3 and f4/f5 holding zero-extended 32-bit values.
130 ld [%sp+2223+0],%f2 C zero f2
131 ld [%sp+2223+0],%f4 C zero f4
132 ld [%i5+%i2], %f3 C read low 32 bits of up[i]
133 ld [%i1+%i2], %f5 C read high 32 bits of up[i]
134 fxtod v00, v00 C integer to double, in place
135 fxtod v16, v16
136 fxtod v32, v32
137 fxtod v48, v48
139 C Start real work. (We sneakingly read f3 and f5 above...)
140 C The software pipeline is very deep, requiring 4 feed-in stages.
C Feed-in stage 1: start the eight products for up[0].  There is no earlier
C limb to add in, so u00*v00 and u00*v16 are written straight to a00 and a16.
142 fxtod %f2, u00 C u00 = low half of the limb as a double
143 fxtod %f4, u32 C u32 = high half of the limb as a double
144 fmuld u00, v00, a00
145 fmuld u00, v16, a16
146 fmuld u00, v32, p32
147 fmuld u32, v00, r32
148 fmuld u00, v48, p48
149 addcc %i2, 8, %i2 C step the index to the next limb; zero means none are left
150 bnz,pt %xcc, .L_two_or_more
151 fmuld u32, v16, r48 C delay slot, executed on both paths
153 .L_one:
C Reached by fall-through when n is 1.  Drains the pipeline for the single
C limb and then joins the shared tail at .L_out_1.
154 fmuld u32, v32, r64 C FIXME not urgent
155 faddd p32, r32, a32 C a32 = u00*v32 + u32*v00
156 fdtox a00, a00 C double to 64-bit integer, in place
157 faddd p48, r48, a48 C a48 = u00*v48 + u32*v16
158 fmuld u32, v48, r80 C FIXME not urgent
159 fdtox a16, a16
160 fdtox a32, a32
161 fdtox a48, a48
162 std a00, [%sp+2223+0] C hand the four sums to the integer side through the stack
163 std a16, [%sp+2223+8]
164 std a32, [%sp+2223+16]
165 std a48, [%sp+2223+24]
166 add %i2, 8, %i2 C the three adds in this path take %i2 from 0 to 24
168 fdtox r64, a00 C the two top products have nothing added to them
169 fdtox r80, a16
170 ldx [%sp+2223+0], i00
171 ldx [%sp+2223+8], i16
172 ldx [%sp+2223+16], i32
173 ldx [%sp+2223+24], i48
174 std a00, [%sp+2223+0] C converted r64 and r80 replace the values in slots 0 and 8
175 std a16, [%sp+2223+8]
176 add %i2, 8, %i2
C Integer recombination of the four sums:
C   hi64 = (i48 >> 16) + (i16 >> 48) + (i32 >> 32)
C   mi64 = i16 + (i32 << 16) + (i48 << 32) - (hi64 << 48)
C   x    = i00 + cy
178 mov i00, %g5 C i00+ now in g5
179 ldx [%sp+2223+0], i00 C i00 now holds the converted r64
180 srlx i16, 48, %l4 C (i16 >> 48)
181 mov i16, %g2
182 ldx [%sp+2223+8], i16 C i16 now holds the converted r80
183 srlx i48, 16, %l5 C (i48 >> 16)
184 mov i32, %g4 C i32+ now in g4
185 sllx i48, 32, %l6 C (i48 << 32)
186 srlx %g4, 32, %o3 C (i32 >> 32)
187 add %l5, %l4, %o1 C hi64- in %o1
188 std a00, [%sp+2223+0] C repeats the store of a00 made above; a00 is unchanged since
189 sllx %g4, 16, %o2 C (i32 << 16)
190 add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT
191 std a16, [%sp+2223+8] C repeats the store of a16 made above
192 sllx %o1, 48, %o3 C (hi64 << 48)
193 add %g2, %o2, %o2 C mi64- in %o2
194 add %l6, %o2, %o2 C mi64- in %o2
195 sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT
196 add cy, %g5, %o4 C x = prev(i00) + cy
197 b .L_out_1
198 add %i2, 8, %i2 C delay slot: third index bump, %i4+%i2 is now rp+8n-8
200 .L_two_or_more:
C Feed-in stage 2, taken when at least two limbs exist: load the next limb,
C finish and store the sums of the previous one, and start the products of
C the new one.
201 ld [%i5+%i2], %f3 C read low 32 bits of up[i]
202 fmuld u32, v32, r64 C FIXME not urgent
203 faddd p32, r32, a32 C a32 = u00*v32 + u32*v00 of the previous limb
204 ld [%i1+%i2], %f5 C read high 32 bits of up[i]
205 fdtox a00, a00 C double to 64-bit integer, in place
206 faddd p48, r48, a48 C a48 = u00*v48 + u32*v16 of the previous limb
207 fmuld u32, v48, r80 C FIXME not urgent
208 fdtox a16, a16
209 fdtox a32, a32
210 fxtod %f2, u00 C from here on u00 and u32 belong to the new limb
211 fxtod %f4, u32
212 fdtox a48, a48
213 std a00, [%sp+2223+0] C previous limb: four sums out to the stack slots
214 fmuld u00, v00, p00
215 std a16, [%sp+2223+8]
216 fmuld u00, v16, p16
217 std a32, [%sp+2223+16]
218 fmuld u00, v32, p32
219 std a48, [%sp+2223+24]
220 faddd p00, r64, a00 C new a00 = new u00*v00 + previous u32*v32
221 fmuld u32, v00, r32
222 faddd p16, r80, a16 C new a16 = new u00*v16 + previous u32*v48
223 fmuld u00, v48, p48
224 addcc %i2, 8, %i2 C step the index; zero means the limb just loaded was the last
225 bnz,pt %xcc, .L_three_or_more
226 fmuld u32, v16, r48 C delay slot, executed on both paths
228 .L_two:
C Reached by fall-through when n is 2.  Drains the pipeline without loading
C further limbs and joins the shared tail at .L_out_2.
229 fmuld u32, v32, r64 C FIXME not urgent
230 faddd p32, r32, a32
231 fdtox a00, a00
232 faddd p48, r48, a48
233 fmuld u32, v48, r80 C FIXME not urgent
234 fdtox a16, a16
235 ldx [%sp+2223+0], i00 C integer sums of the first limb, stored in stage 2
236 fdtox a32, a32
237 ldx [%sp+2223+8], i16
238 ldx [%sp+2223+16], i32
239 ldx [%sp+2223+24], i48
240 fdtox a48, a48
241 std a00, [%sp+2223+0] C sums of the second limb take over the slots
242 std a16, [%sp+2223+8]
243 std a32, [%sp+2223+16]
244 std a48, [%sp+2223+24]
245 add %i2, 8, %i2 C two adds in this path take %i2 from 0 to 16
247 fdtox r64, a00 C the two top products of the last limb have nothing added
248 mov i00, %g5 C i00+ now in g5
249 fdtox r80, a16
C From here the first limb is recombined (same hi64/mi64/x formulas as in
C the main loop) while i00..i48 are refilled with the second limb sums.
250 ldx [%sp+2223+0], i00
251 srlx i16, 48, %l4 C (i16 >> 48)
252 mov i16, %g2
253 ldx [%sp+2223+8], i16
254 srlx i48, 16, %l5 C (i48 >> 16)
255 mov i32, %g4 C i32+ now in g4
256 ldx [%sp+2223+16], i32
257 sllx i48, 32, %l6 C (i48 << 32)
258 ldx [%sp+2223+24], i48
259 srlx %g4, 32, %o3 C (i32 >> 32)
260 add %l5, %l4, %o1 C hi64- in %o1
261 std a00, [%sp+2223+0] C converted r64, read back at .L_out_2
262 sllx %g4, 16, %o2 C (i32 << 16)
263 add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT
264 std a16, [%sp+2223+8] C converted r80, read back at .L_out_2
265 sllx %o1, 48, %o3 C (hi64 << 48)
266 add %g2, %o2, %o2 C mi64- in %o2
267 add %l6, %o2, %o2 C mi64- in %o2
268 sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT
269 add cy, %g5, %o4 C x = prev(i00) + cy
270 b .L_out_2
271 add %i2, 8, %i2 C delay slot: second index bump
273 .L_three_or_more:
C Feed-in stage 3, taken when at least three limbs exist.  Same work as
C stage 2, plus the first ldx of integer sums from the stack (those of the
C limb stored one stage earlier).
274 ld [%i5+%i2], %f3 C read low 32 bits of up[i]
275 fmuld u32, v32, r64 C FIXME not urgent
276 faddd p32, r32, a32
277 ld [%i1+%i2], %f5 C read high 32 bits of up[i]
278 fdtox a00, a00
279 faddd p48, r48, a48
280 fmuld u32, v48, r80 C FIXME not urgent
281 fdtox a16, a16
282 ldx [%sp+2223+0], i00 C each slot is read before the std below overwrites it
283 fdtox a32, a32
284 ldx [%sp+2223+8], i16
285 fxtod %f2, u00 C from here on u00 and u32 belong to the new limb
286 ldx [%sp+2223+16], i32
287 fxtod %f4, u32
288 ldx [%sp+2223+24], i48
289 fdtox a48, a48
290 std a00, [%sp+2223+0]
291 fmuld u00, v00, p00
292 std a16, [%sp+2223+8]
293 fmuld u00, v16, p16
294 std a32, [%sp+2223+16]
295 fmuld u00, v32, p32
296 std a48, [%sp+2223+24]
297 faddd p00, r64, a00 C new a00 = new u00*v00 + previous u32*v32
298 fmuld u32, v00, r32
299 faddd p16, r80, a16 C new a16 = new u00*v16 + previous u32*v48
300 fmuld u00, v48, p48
301 addcc %i2, 8, %i2 C step the index; zero means the limb just loaded was the last
302 bnz,pt %xcc, .L_four_or_more
303 fmuld u32, v16, r48 C delay slot, executed on both paths
305 .L_three:
C Reached by fall-through when n is 3.  Drains the pipeline without loading
C further limbs and joins the shared tail at .L_out_3.  The integer code
C recombines the oldest limb with the same hi64/mi64/x formulas as the main
C loop.
306 fmuld u32, v32, r64 C FIXME not urgent
307 faddd p32, r32, a32
308 fdtox a00, a00
309 faddd p48, r48, a48
310 mov i00, %g5 C i00+ now in g5
311 fmuld u32, v48, r80 C FIXME not urgent
312 fdtox a16, a16
313 ldx [%sp+2223+0], i00 C refill i00..i48 with the sums of the next limb
314 fdtox a32, a32
315 srlx i16, 48, %l4 C (i16 >> 48)
316 mov i16, %g2
317 ldx [%sp+2223+8], i16
318 srlx i48, 16, %l5 C (i48 >> 16)
319 mov i32, %g4 C i32+ now in g4
320 ldx [%sp+2223+16], i32
321 sllx i48, 32, %l6 C (i48 << 32)
322 ldx [%sp+2223+24], i48
323 fdtox a48, a48
324 srlx %g4, 32, %o3 C (i32 >> 32)
325 add %l5, %l4, %o1 C hi64- in %o1
326 std a00, [%sp+2223+0] C sums of the last limb, read back at .L_out_3
327 sllx %g4, 16, %o2 C (i32 << 16)
328 add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT
329 std a16, [%sp+2223+8]
330 sllx %o1, 48, %o3 C (hi64 << 48)
331 add %g2, %o2, %o2 C mi64- in %o2
332 std a32, [%sp+2223+16]
333 add %l6, %o2, %o2 C mi64- in %o2
334 std a48, [%sp+2223+24]
335 sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT
336 add cy, %g5, %o4 C x = prev(i00) + cy
337 b .L_out_3
338 add %i2, 8, %i2 C delay slot: %i2 goes from 0 to 8
340 .L_four_or_more:
C Feed-in stage 4, taken when at least four limbs exist.  This is one main
C loop iteration minus the part that finishes and stores a result limb: no
C limb has been recombined before this point, so there is nothing to store.
341 ld [%i5+%i2], %f3 C read low 32 bits of up[i]
342 fmuld u32, v32, r64 C FIXME not urgent
343 faddd p32, r32, a32
344 ld [%i1+%i2], %f5 C read high 32 bits of up[i]
345 fdtox a00, a00
346 faddd p48, r48, a48
347 mov i00, %g5 C i00+ now in g5
348 fmuld u32, v48, r80 C FIXME not urgent
349 fdtox a16, a16
350 ldx [%sp+2223+0], i00 C refill i00..i48 with the sums of the next limb
351 fdtox a32, a32
352 srlx i16, 48, %l4 C (i16 >> 48)
353 mov i16, %g2
354 ldx [%sp+2223+8], i16
355 fxtod %f2, u00 C from here on u00 and u32 belong to the new limb
356 srlx i48, 16, %l5 C (i48 >> 16)
357 mov i32, %g4 C i32+ now in g4
358 ldx [%sp+2223+16], i32
359 fxtod %f4, u32
360 sllx i48, 32, %l6 C (i48 << 32)
361 ldx [%sp+2223+24], i48
362 fdtox a48, a48
363 srlx %g4, 32, %o3 C (i32 >> 32)
364 add %l5, %l4, %o1 C hi64- in %o1
365 std a00, [%sp+2223+0]
366 fmuld u00, v00, p00
367 sllx %g4, 16, %o2 C (i32 << 16)
368 add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT
369 std a16, [%sp+2223+8]
370 fmuld u00, v16, p16
371 sllx %o1, 48, %o3 C (hi64 << 48)
372 add %g2, %o2, %o2 C mi64- in %o2
373 std a32, [%sp+2223+16]
374 fmuld u00, v32, p32
375 add %l6, %o2, %o2 C mi64- in %o2
376 std a48, [%sp+2223+24]
377 faddd p00, r64, a00 C new a00 = new u00*v00 + previous u32*v32
378 fmuld u32, v00, r32
379 sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT
380 faddd p16, r80, a16 C new a16 = new u00*v16 + previous u32*v48
381 fmuld u00, v48, p48
382 add cy, %g5, %o4 C x = prev(i00) + cy
383 addcc %i2, 8, %i2 C step the index; zero means the limb just loaded was the last
384 bnz,pt %xcc, .Loop
385 fmuld u32, v16, r48 C delay slot, executed on both paths
387 .L_four:
C Reached by fall-through when n is 4.  The ,a (annul) form keeps the
C instruction that follows this branch, the first one of .Loop, from being
C executed as a delay slot.
388 b,a .L_out_4
390 C BEGIN MAIN LOOP
C Steady state, entered only when n is at least 5.  Each iteration overlaps
C work on several limbs:
C   - loads the two halves of the next up limb into f3 and f5;
C   - finishes the products and sums of the limb loaded one iteration ago and
C     stores its converted sums to the four stack slots;
C   - reloads the sums stored one iteration ago into i00..i48 and computes
C       hi64 = (i48 >> 16) + (i16 >> 48) + (i32 >> 32)
C       mi64 = i16 + (i32 << 16) + (i48 << 32) - (hi64 << 48)
C       x    = i00 + cy
C     for the limb whose sums were in i00..i48 on entry;
C   - completes the limb prepared by the previous iteration:
C       mi64 += x >> 16
C       limb  = (mi64 << 16) | (x & 0xffff)
C       cy    = (mi64 >> 48) + hi64
C     and stores it to [%i4+%i2].
C The numbered C 00 .. C 13 markers split the body into 14 groups, matching
C the 14 cycles per limb quoted in the file header.
C %i3 (v on entry) and %o7 are used as plain temporaries here; v was split
C into v00..v48 during initialization.
391 .align 16
392 .Loop:
393 C 00
394 srlx %o4, 16, %o5 C (x >> 16)
395 ld [%i5+%i2], %f3 C read low 32 bits of up[i]
396 fmuld u32, v32, r64 C FIXME not urgent
397 faddd p32, r32, a32
398 C 01
399 add %o5, %o2, %o2 C mi64 in %o2 2nd ASSIGNMENT
400 and %o4, xffff, %o5 C (x & 0xffff)
401 ld [%i1+%i2], %f5 C read high 32 bits of up[i]
402 fdtox a00, a00
403 C 02
404 faddd p48, r48, a48
405 C 03
406 srlx %o2, 48, %o7 C (mi64 >> 48)
407 mov i00, %g5 C i00+ now in g5
408 fmuld u32, v48, r80 C FIXME not urgent
409 fdtox a16, a16
410 C 04
411 sllx %o2, 16, %i3 C (mi64 << 16)
412 add %o7, %o1, cy C new cy
413 ldx [%sp+2223+0], i00
414 fdtox a32, a32
415 C 05
416 srlx i16, 48, %l4 C (i16 >> 48)
417 mov i16, %g2
418 ldx [%sp+2223+8], i16
419 fxtod %f2, u00
420 C 06
421 srlx i48, 16, %l5 C (i48 >> 16)
422 mov i32, %g4 C i32+ now in g4
423 ldx [%sp+2223+16], i32
424 fxtod %f4, u32
425 C 07
426 sllx i48, 32, %l6 C (i48 << 32)
427 or %i3, %o5, %o5 C finished result limb = (mi64 << 16) | (x & 0xffff)
428 ldx [%sp+2223+24], i48
429 fdtox a48, a48
430 C 08
431 srlx %g4, 32, %o3 C (i32 >> 32)
432 add %l5, %l4, %o1 C hi64- in %o1
433 std a00, [%sp+2223+0]
434 fmuld u00, v00, p00
435 C 09
436 sllx %g4, 16, %o2 C (i32 << 16)
437 add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT
438 std a16, [%sp+2223+8]
439 fmuld u00, v16, p16
440 C 10
441 sllx %o1, 48, %o3 C (hi64 << 48)
442 add %g2, %o2, %o2 C mi64- in %o2
443 std a32, [%sp+2223+16]
444 fmuld u00, v32, p32
445 C 11
446 add %l6, %o2, %o2 C mi64- in %o2
447 std a48, [%sp+2223+24]
448 faddd p00, r64, a00
449 fmuld u32, v00, r32
450 C 12
451 sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT
452 stx %o5, [%i4+%i2] C write the result limb, four limbs behind the load index
453 faddd p16, r80, a16
454 fmuld u00, v48, p48
455 C 13
456 add cy, %g5, %o4 C x = prev(i00) + cy
457 addcc %i2, 8, %i2 C step the index; zero means the limb just loaded was the last
458 bnz,pt %xcc, .Loop
459 fmuld u32, v16, r48 C delay slot, executed on both paths
460 C END MAIN LOOP
462 .L_out_4:
C Wind-down stage with four result limbs still to be written.  Entered from
C the main loop by fall-through or from .L_four.  Same as a loop iteration
C but without loading a new up limb and without starting new products.
463 srlx %o4, 16, %o5 C (x >> 16)
464 fmuld u32, v32, r64 C FIXME not urgent
465 faddd p32, r32, a32
466 add %o5, %o2, %o2 C mi64 in %o2 2nd ASSIGNMENT
467 and %o4, xffff, %o5 C (x & 0xffff)
468 fdtox a00, a00
469 faddd p48, r48, a48
470 srlx %o2, 48, %o7 C (mi64 >> 48)
471 mov i00, %g5 C i00+ now in g5
472 fmuld u32, v48, r80 C FIXME not urgent
473 fdtox a16, a16
474 sllx %o2, 16, %i3 C (mi64 << 16)
475 add %o7, %o1, cy C new cy
476 ldx [%sp+2223+0], i00
477 fdtox a32, a32
478 srlx i16, 48, %l4 C (i16 >> 48)
479 mov i16, %g2
480 ldx [%sp+2223+8], i16
481 srlx i48, 16, %l5 C (i48 >> 16)
482 mov i32, %g4 C i32+ now in g4
483 ldx [%sp+2223+16], i32
484 sllx i48, 32, %l6 C (i48 << 32)
485 or %i3, %o5, %o5 C finished result limb = (mi64 << 16) | (x & 0xffff)
486 ldx [%sp+2223+24], i48
487 fdtox a48, a48
488 srlx %g4, 32, %o3 C (i32 >> 32)
489 add %l5, %l4, %o1 C hi64- in %o1
490 std a00, [%sp+2223+0] C sums of the last limb, read back at .L_out_3
491 sllx %g4, 16, %o2 C (i32 << 16)
492 add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT
493 std a16, [%sp+2223+8]
494 sllx %o1, 48, %o3 C (hi64 << 48)
495 add %g2, %o2, %o2 C mi64- in %o2
496 std a32, [%sp+2223+16]
497 add %l6, %o2, %o2 C mi64- in %o2
498 std a48, [%sp+2223+24]
499 sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT
500 stx %o5, [%i4+%i2] C write the result limb
501 add cy, %g5, %o4 C x = prev(i00) + cy
502 add %i2, 8, %i2 C plain add: no branch follows, control falls into .L_out_3
503 .L_out_3:
C Wind-down stage with three result limbs still to be written.  Entered by
C fall-through from .L_out_4 or by branch from .L_three.  The only fp work
C left is converting the two top products r64 and r80 of the last limb.
504 srlx %o4, 16, %o5 C (x >> 16)
505 add %o5, %o2, %o2 C mi64 in %o2 2nd ASSIGNMENT
506 and %o4, xffff, %o5 C (x & 0xffff)
507 fdtox r64, a00 C r64 and r80 have nothing added to them
508 srlx %o2, 48, %o7 C (mi64 >> 48)
509 mov i00, %g5 C i00+ now in g5
510 fdtox r80, a16
511 sllx %o2, 16, %i3 C (mi64 << 16)
512 add %o7, %o1, cy C new cy
513 ldx [%sp+2223+0], i00 C sums of the last limb
514 srlx i16, 48, %l4 C (i16 >> 48)
515 mov i16, %g2
516 ldx [%sp+2223+8], i16
517 srlx i48, 16, %l5 C (i48 >> 16)
518 mov i32, %g4 C i32+ now in g4
519 ldx [%sp+2223+16], i32
520 sllx i48, 32, %l6 C (i48 << 32)
521 or %i3, %o5, %o5 C finished result limb = (mi64 << 16) | (x & 0xffff)
522 ldx [%sp+2223+24], i48
523 srlx %g4, 32, %o3 C (i32 >> 32)
524 add %l5, %l4, %o1 C hi64- in %o1
525 std a00, [%sp+2223+0] C converted r64, read back at .L_out_2
526 sllx %g4, 16, %o2 C (i32 << 16)
527 add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT
528 std a16, [%sp+2223+8] C converted r80, read back at .L_out_2
529 sllx %o1, 48, %o3 C (hi64 << 48)
530 add %g2, %o2, %o2 C mi64- in %o2
531 add %l6, %o2, %o2 C mi64- in %o2
532 sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT
533 stx %o5, [%i4+%i2] C write the result limb
534 add cy, %g5, %o4 C x = prev(i00) + cy
535 add %i2, 8, %i2 C control falls into .L_out_2
536 .L_out_2:
C Wind-down stage with two result limbs still to be written.  Entered by
C fall-through from .L_out_3 or by branch from .L_two.  Integer work only:
C finish one limb, recombine the sums of the last limb, and pick up the
C converted r64 and r80 from stack slots 0 and 8.
537 srlx %o4, 16, %o5 C (x >> 16)
538 add %o5, %o2, %o2 C mi64 in %o2 2nd ASSIGNMENT
539 and %o4, xffff, %o5 C (x & 0xffff)
540 srlx %o2, 48, %o7 C (mi64 >> 48)
541 mov i00, %g5 C i00+ now in g5
542 sllx %o2, 16, %i3 C (mi64 << 16)
543 add %o7, %o1, cy C new cy
544 ldx [%sp+2223+0], i00 C i00 = converted r64 of the last limb
545 srlx i16, 48, %l4 C (i16 >> 48)
546 mov i16, %g2
547 ldx [%sp+2223+8], i16 C i16 = converted r80 of the last limb
548 srlx i48, 16, %l5 C (i48 >> 16)
549 mov i32, %g4 C i32+ now in g4
550 sllx i48, 32, %l6 C (i48 << 32)
551 or %i3, %o5, %o5 C finished result limb = (mi64 << 16) | (x & 0xffff)
552 srlx %g4, 32, %o3 C (i32 >> 32)
553 add %l5, %l4, %o1 C hi64- in %o1
554 sllx %g4, 16, %o2 C (i32 << 16)
555 add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT
556 sllx %o1, 48, %o3 C (hi64 << 48)
557 add %g2, %o2, %o2 C mi64- in %o2
558 add %l6, %o2, %o2 C mi64- in %o2
559 sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT
560 stx %o5, [%i4+%i2] C write the result limb
561 add cy, %g5, %o4 C x = prev(i00) + cy
562 add %i2, 8, %i2 C control falls into .L_out_1
563 .L_out_1:
C Final stage.  Entered by fall-through from .L_out_2 or by branch from
C .L_one.  Finishes and stores the last result limb, then folds the two top
C products, held in i00 and i16 at this point, into the carry that is
C returned.
564 srlx %o4, 16, %o5 C (x >> 16)
565 add %o5, %o2, %o2 C mi64 in %o2 2nd ASSIGNMENT
566 and %o4, xffff, %o5 C (x & 0xffff)
567 srlx %o2, 48, %o7 C (mi64 >> 48)
568 sllx %o2, 16, %i3 C (mi64 << 16)
569 add %o7, %o1, cy C new cy
570 or %i3, %o5, %o5 C last result limb = (mi64 << 16) | (x & 0xffff)
571 stx %o5, [%i4+%i2] C write the last result limb
573 sllx i00, 0, %g2 C shift count is 0, so %g2 is a copy of i00
574 add %g2, cy, cy C cy += i00
575 sllx i16, 16, %g3
576 add %g3, cy, cy C cy += i16 << 16
C cy lives in a global register (%g1), so it is still readable in the delay
C slot of return.  NOTE(review): per SPARC V9, return restores the caller
C register window before the delay slot runs, which would make %o0 below the
C caller %o0 -- confirm against the architecture manual.
578 return %i7+8
579 mov cy, %o0 C delay slot: return value
580 EPILOGUE(mpn_mul_1)