beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / pa64 / addmul_1.asm
blob2cb9af9f147a3c5e2b3e13a23be066d24df2dd78
1 dnl HP-PA 2.0 64-bit mpn_addmul_1 -- Multiply a limb vector with a limb and
2 dnl add the result to a second limb vector.
4 dnl Copyright 1998-2000, 2002, 2003 Free Software Foundation, Inc.
6 dnl This file is part of the GNU MP Library.
7 dnl
8 dnl The GNU MP Library is free software; you can redistribute it and/or modify
9 dnl it under the terms of either:
10 dnl
11 dnl * the GNU Lesser General Public License as published by the Free
12 dnl Software Foundation; either version 3 of the License, or (at your
13 dnl option) any later version.
14 dnl
15 dnl or
16 dnl
17 dnl * the GNU General Public License as published by the Free Software
18 dnl Foundation; either version 2 of the License, or (at your option) any
19 dnl later version.
20 dnl
21 dnl or both in parallel, as here.
22 dnl
23 dnl The GNU MP Library is distributed in the hope that it will be useful, but
24 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
25 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
26 dnl for more details.
27 dnl
28 dnl You should have received copies of the GNU General Public License and the
29 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
30 dnl see https://www.gnu.org/licenses/.
32 include(`../config.m4')
34 C cycles/limb
35 C 8000,8200: 7
36 C 8500,8600,8700: 6.375
38 C The feed-in and wind-down code has not yet been scheduled. Many cycles
39 C could be saved there per call.
41 C DESCRIPTION:
42 C The main loop "BIG" is 4-way unrolled, mainly to allow
43 C effective use of ADD,DC. Delays in moving data via the cache from the FP
44 C registers to the IU registers, have demanded a deep software pipeline, and
45 C a lot of stack slots for partial products in flight.
47 C CODE STRUCTURE:
48 C save-some-registers
49 C do 0, 1, 2, or 3 limbs
50 C if done, restore-some-regs and return
51 C save-many-regs
52 C do 4, 8, ... limbs
53 C restore-all-regs
55 C STACK LAYOUT:
56 C HP-PA stack grows upwards. We could allocate 8 fewer slots by using the
57 C slots marked FREE, as well as some slots in the caller's "frame marker".
59 C -00 <- r30
60 C -08 FREE
61 C -10 tmp
62 C -18 tmp
63 C -20 tmp
64 C -28 tmp
65 C -30 tmp
66 C -38 tmp
67 C -40 tmp
68 C -48 tmp
69 C -50 tmp
70 C -58 tmp
71 C -60 tmp
72 C -68 tmp
73 C -70 tmp
74 C -78 tmp
75 C -80 tmp
76 C -88 tmp
77 C -90 FREE
78 C -98 FREE
79 C -a0 FREE
80 C -a8 FREE
81 C -b0 r13
82 C -b8 r12
83 C -c0 r11
84 C -c8 r10
85 C -d0 r9
86 C -d8 r8
87 C -e0 r7
88 C -e8 r6
89 C -f0 r5
90 C -f8 r4
91 C -100 r3
92 C Previous frame:
93 C [unused area]
94 C -38/-138 vlimb home slot. For 2.0N, the vlimb arg will arrive here.
97 include(`../config.m4')
99 C INPUT PARAMETERS:
C Argument register aliases used throughout mpn_addmul_1.
100 define(`rp',`%r26') C limb vector that is read, accumulated into and written back
101 define(`up',`%r25') C limb vector that is multiplied by vlimb
102 define(`n',`%r24') C limb count
103 define(`vlimb',`%r23') C multiplier limb
C climb is the running carry limb. It shares the register of vlimb.
105 define(`climb',`%r23') C
C Pick the assembler level that matches the ABI. The terminating quote and
C parenthesis of this ifdef were missing and are restored on the last line.
107 ifdef(`HAVE_ABI_2_0w',
108 ` .level 2.0w
109 ',` .level 2.0
110 ')
C mpn_addmul_1 entry. Saves r3-r5, allocates the 0x100 byte frame, places
C vlimb in %fr8, clears the carry limb and dispatches on n mod 4.
111 PROLOGUE(mpn_addmul_1)
C For the 2.0w ABI vlimb is written to its home slot so that the fldd below
C can fetch it from memory. The terminating quote and parenthesis of this
C ifdef were missing and are restored.
113 ifdef(`HAVE_ABI_2_0w',
114 ` std vlimb, -0x38(%r30) C store vlimb into "home" slot
115 ')
116 std,ma %r3, 0x100(%r30) C save r3, then advance r30 by 0x100
117 std %r4, -0xf8(%r30)
118 std %r5, -0xf0(%r30)
119 ldo 0(%r0), climb C clear climb
120 fldd -0x138(%r30), %fr8 C put vlimb in fp register
C Integer register aliases for the 1..3 limb feed-in code. They are redefined
C at BIG for the 4-way unrolled code.
122 define(`p032a1',`%r1') C first mid product
123 define(`p032a2',`%r19') C second mid product
125 define(`m032',`%r20') C sum of the two mid products
126 define(`m096',`%r21') C carry out of that sum
128 define(`p000a',`%r22') C low product
129 define(`p064a',`%r29') C high product
131 define(`s000',`%r31') C result limb being accumulated
133 define(`ma000',`%r4') C mid sum aligned to the low result limb
134 define(`ma064',`%r20') C mid sum aligned to the carry limb (same register as m032)
136 define(`r000',`%r3') C limb loaded from rp
C %r5 = low two bits of n. Zero means only whole groups of four remain.
138 extrd,u n, 63, 2, %r5
C NOTE(review): no filler follows this branch, so the fldd that comes next
C occupies its delay slot. That looks harmless because BIG reloads %fr4
C before using it; confirm whether a nop was lost from this copy.
139 cmpb,= %r5, %r0, L(BIG)
C Feed-in for n mod 4 != 0. Form the four 32x32 partial products of the first
C leftover limb with vlimb (%fr8) and park them in stack slots, from where
C they are reloaded into integer registers.
142 fldd 0(up), %fr4 C first leftover source limb
143 ldo 8(up), up
144 xmpyu %fr8R, %fr4L, %fr22
145 xmpyu %fr8L, %fr4R, %fr23
146 fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71
147 xmpyu %fr8R, %fr4R, %fr24
148 xmpyu %fr8L, %fr4L, %fr25
149 fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69
150 fstd %fr24, -0x80(%r30) C low product to -0x80..-0x79
C Decrement the leftover count. A non-zero result means at least two leftover
C limbs. The fstd after the branch is needed on both paths.
151 addib,<> -1, %r5, L(two_or_more)
152 fstd %fr25, -0x68(%r30) C high product to -0x68..-0x61
C Exactly one leftover limb: reload its products and finish in 0_one_out.
C The ldd after the branch supplies p064a, which 0_one_out consumes.
153 LDEF(one)
154 ldd -0x78(%r30), p032a1
155 ldd -0x70(%r30), p032a2
156 ldd -0x80(%r30), p000a
157 b L(0_one_out)
158 ldd -0x68(%r30), p064a
C At least two leftover limbs. Multiply the second limb while reloading the
C products of the first one. Each stack slot is read (ldd) just before it is
C overwritten (fstd) with the new product of the same kind.
160 LDEF(two_or_more)
161 fldd 0(up), %fr4 C second leftover source limb
162 ldo 8(up), up
163 xmpyu %fr8R, %fr4L, %fr22
164 xmpyu %fr8L, %fr4R, %fr23
165 ldd -0x78(%r30), p032a1
166 fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71
167 xmpyu %fr8R, %fr4R, %fr24
168 xmpyu %fr8L, %fr4L, %fr25
169 ldd -0x70(%r30), p032a2
170 fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69
171 ldd -0x80(%r30), p000a
172 fstd %fr24, -0x80(%r30) C low product to -0x80..-0x79
173 ldd -0x68(%r30), p064a
C Decrement the leftover count again. A non-zero result means three limbs.
174 addib,<> -1, %r5, L(three_or_more)
175 fstd %fr25, -0x68(%r30) C high product to -0x68..-0x61
C Exactly two leftover limbs: combine the mid products of the first limb and
C continue in 0_two_out.
176 LDEF(two)
177 add p032a1, p032a2, m032 C sum of the two mid products
178 add,dc %r0, %r0, m096 C carry out of that sum
179 depd,z m032, 31, 32, ma000 C low half of m032 into the upper half of ma000, rest zero
180 extrd,u m032, 31, 32, ma064 C upper half of m032 into the low half of ma064
181 ldd 0(rp), r000 C limb to add into
182 b L(0_two_out)
183 depd m096, 31, 32, ma064 C carry of the mid sum into the upper half of ma064
C Three leftover limbs. Load the third limb and combine the mid products of
C the first one. The loop0 body below is entirely commented out, so control
C falls through the loop0 label straight into 0_out.
185 LDEF(three_or_more)
186 fldd 0(up), %fr4 C third leftover source limb
187 add p032a1, p032a2, m032 C sum of the two mid products
188 add,dc %r0, %r0, m096 C carry out of that sum
189 depd,z m032, 31, 32, ma000
190 extrd,u m032, 31, 32, ma064
191 ldd 0(rp), r000 C limb to add into
192 C addib,= -1, %r5, L(0_out)
193 depd m096, 31, 32, ma064
C Disabled loop: every instruction up to 0_out is a comment.
194 LDEF(loop0)
195 C xmpyu %fr8R, %fr4L, %fr22
196 C xmpyu %fr8L, %fr4R, %fr23
197 C ldd -0x78(%r30), p032a1
198 C fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71
200 C xmpyu %fr8R, %fr4R, %fr24
201 C xmpyu %fr8L, %fr4L, %fr25
202 C ldd -0x70(%r30), p032a2
203 C fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69
205 C ldo 8(rp), rp
206 C add climb, p000a, s000
207 C ldd -0x80(%r30), p000a
208 C fstd %fr24, -0x80(%r30) C low product to -0x80..-0x79
210 C add,dc p064a, %r0, climb
211 C ldo 8(up), up
212 C ldd -0x68(%r30), p064a
213 C fstd %fr25, -0x68(%r30) C high product to -0x68..-0x61
215 C add ma000, s000, s000
216 C add,dc ma064, climb, climb
217 C fldd 0(up), %fr4
219 C add r000, s000, s000
220 C add,dc %r0, climb, climb
221 C std s000, -8(rp)
223 C add p032a1, p032a2, m032
224 C add,dc %r0, %r0, m096
226 C depd,z m032, 31, 32, ma000
227 C extrd,u m032, 31, 32, ma064
228 C ldd 0(rp), r000
229 C addib,<> -1, %r5, L(loop0)
230 C depd m096, 31, 32, ma064
C Wind-down of the 1..3 limb feed-in code. The three labels are entered with
C three, two and one result limbs still to be produced respectively, and
C each stage falls through into the next.
C
C 0_out: multiply the limb already in %fr4, swap the previous limb products
C out of the stack slots, and finish the oldest limb.
231 LDEF(0_out)
232 ldo 8(up), up
233 xmpyu %fr8R, %fr4L, %fr22
234 xmpyu %fr8L, %fr4R, %fr23
235 ldd -0x78(%r30), p032a1
236 fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71
237 xmpyu %fr8R, %fr4R, %fr24
238 xmpyu %fr8L, %fr4L, %fr25
239 ldd -0x70(%r30), p032a2
240 fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69
241 ldo 8(rp), rp
242 add climb, p000a, s000 C carry in plus low product
243 ldd -0x80(%r30), p000a
244 fstd %fr24, -0x80(%r30) C low product to -0x80..-0x79
245 add,dc p064a, %r0, climb C high product plus carry becomes the new carry limb
246 ldd -0x68(%r30), p064a
247 fstd %fr25, -0x68(%r30) C high product to -0x68..-0x61
248 add ma000, s000, s000 C add aligned mid sum, low part
249 add,dc ma064, climb, climb C add aligned mid sum, high part
250 add r000, s000, s000 C add the limb loaded from rp
251 add,dc %r0, climb, climb
252 std s000, -8(rp) C rp was already advanced, hence -8
253 add p032a1, p032a2, m032
254 add,dc %r0, %r0, m096
255 depd,z m032, 31, 32, ma000
256 extrd,u m032, 31, 32, ma064
257 ldd 0(rp), r000
258 depd m096, 31, 32, ma064
C 0_two_out: reload the products of the last limb from the stack slots and
C finish the limb whose mid sum is already in ma000/ma064.
259 LDEF(0_two_out)
260 ldd -0x78(%r30), p032a1
261 ldd -0x70(%r30), p032a2
262 ldo 8(rp), rp
263 add climb, p000a, s000
264 ldd -0x80(%r30), p000a
265 add,dc p064a, %r0, climb
266 ldd -0x68(%r30), p064a
267 add ma000, s000, s000
268 add,dc ma064, climb, climb
269 add r000, s000, s000
270 add,dc %r0, climb, climb
271 std s000, -8(rp)
C 0_one_out: combine the mid products of the last limb and finish it.
272 LDEF(0_one_out)
273 add p032a1, p032a2, m032
274 add,dc %r0, %r0, m096
275 depd,z m032, 31, 32, ma000
276 extrd,u m032, 31, 32, ma064
277 ldd 0(rp), r000
278 depd m096, 31, 32, ma064
280 add climb, p000a, s000
281 add,dc p064a, %r0, climb
282 add ma000, s000, s000
283 add,dc ma064, climb, climb
284 add r000, s000, s000
285 add,dc %r0, climb, climb
286 std s000, 0(rp)
C Branch to done when 4 >= n, otherwise fall into BIG. The ldo after the
C branch steps rp past the limb just stored.
288 cmpib,>= 4, n, L(done)
289 ldo 8(rp), rp
291 C 4-way unrolled code.
C Entered either directly from the dispatch (n mod 4 == 0) or by falling
C through after the leftover limbs. Redefines the integer register aliases
C for four limbs per iteration, saves r6-r13 and turns n into a group count.
293 LDEF(BIG)
C Mid products, two per limb (limbs a, b, c, d of the current group).
295 define(`p032a1',`%r1') C
296 define(`p032a2',`%r19') C
297 define(`p096b1',`%r20') C
298 define(`p096b2',`%r21') C
299 define(`p160c1',`%r22') C
300 define(`p160c2',`%r29') C
301 define(`p224d1',`%r31') C
302 define(`p224d2',`%r3') C
C Sums of the mid products, chained with carry. m288 is the final carry.
304 define(`m032',`%r4') C
305 define(`m096',`%r5') C
306 define(`m160',`%r6') C
307 define(`m224',`%r7') C
308 define(`m288',`%r8') C
C Low and high products. These reuse the registers of the mid products.
310 define(`p000a',`%r1') C
311 define(`p064a',`%r19') C
312 define(`p064b',`%r20') C
313 define(`p128b',`%r21') C
314 define(`p128c',`%r22') C
315 define(`p192c',`%r29') C
316 define(`p192d',`%r31') C
317 define(`p256d',`%r3') C
C The four result limbs being accumulated.
319 define(`s000',`%r10') C
320 define(`s064',`%r11') C
321 define(`s128',`%r12') C
322 define(`s192',`%r13') C
C Mid sums realigned to limb boundaries. ma064..ma256 reuse m032..m224.
324 define(`ma000',`%r9') C
325 define(`ma064',`%r4') C
326 define(`ma128',`%r5') C
327 define(`ma192',`%r6') C
328 define(`ma256',`%r7') C
C The four limbs loaded from rp.
330 define(`r000',`%r1') C
331 define(`r064',`%r19') C
332 define(`r128',`%r20') C
333 define(`r192',`%r21') C
C Save the additional registers used only by the unrolled code. They are
C reloaded from the same slots after end1.
335 std %r6, -0xe8(%r30)
336 std %r7, -0xe0(%r30)
337 std %r8, -0xd8(%r30)
338 std %r9, -0xd0(%r30)
339 std %r10, -0xc8(%r30)
340 std %r11, -0xc0(%r30)
341 std %r12, -0xb8(%r30)
342 std %r13, -0xb0(%r30)
C n becomes the number of groups of four limbs. The terminating quote and
C parenthesis of this ifdef were missing and are restored.
344 ifdef(`HAVE_ABI_2_0w',
345 ` extrd,u n, 61, 62, n C right shift 2
346 ',` extrd,u n, 61, 30, n C right shift 2, zero extend
347 ')
C Pipeline feed-in. Load the first group of four limbs and form all sixteen
C 32x32 partial products. %fr22-%fr27 are reused for the low/high products
C once the mid products they held have been stored.
349 LDEF(4_or_more)
350 fldd 0(up), %fr4
351 fldd 8(up), %fr5
352 fldd 16(up), %fr6
353 fldd 24(up), %fr7
354 xmpyu %fr8R, %fr4L, %fr22
355 xmpyu %fr8L, %fr4R, %fr23
356 xmpyu %fr8R, %fr5L, %fr24
357 xmpyu %fr8L, %fr5R, %fr25
358 xmpyu %fr8R, %fr6L, %fr26
359 xmpyu %fr8L, %fr6R, %fr27
360 fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71
361 xmpyu %fr8R, %fr7L, %fr28
362 xmpyu %fr8L, %fr7R, %fr29
363 fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69
364 xmpyu %fr8R, %fr4R, %fr30
365 xmpyu %fr8L, %fr4L, %fr31
366 fstd %fr24, -0x38(%r30) C mid product to -0x38..-0x31
367 xmpyu %fr8R, %fr5R, %fr22
368 xmpyu %fr8L, %fr5L, %fr23
369 fstd %fr25, -0x30(%r30) C mid product to -0x30..-0x29
370 xmpyu %fr8R, %fr6R, %fr24
371 xmpyu %fr8L, %fr6L, %fr25
372 fstd %fr26, -0x58(%r30) C mid product to -0x58..-0x51
373 xmpyu %fr8R, %fr7R, %fr26
374 fstd %fr27, -0x50(%r30) C mid product to -0x50..-0x49
C Decrement the group count. A non-zero result means at least two groups.
C The xmpyu after the branch is needed on both paths.
375 addib,<> -1, n, L(8_or_more)
376 xmpyu %fr8L, %fr7L, %fr27
C Only one group: store the remaining products, reload the eight mid
C products into integer registers and go straight to end1.
377 fstd %fr28, -0x18(%r30) C mid product to -0x18..-0x11
378 fstd %fr29, -0x10(%r30) C mid product to -0x10..-0x09
379 fstd %fr30, -0x80(%r30) C low product to -0x80..-0x79
380 fstd %fr31, -0x68(%r30) C high product to -0x68..-0x61
381 fstd %fr22, -0x40(%r30) C low product to -0x40..-0x39
382 fstd %fr23, -0x28(%r30) C high product to -0x28..-0x21
383 fstd %fr24, -0x60(%r30) C low product to -0x60..-0x59
384 fstd %fr25, -0x48(%r30) C high product to -0x48..-0x41
385 fstd %fr26, -0x20(%r30) C low product to -0x20..-0x19
386 fstd %fr27, -0x88(%r30) C high product to -0x88..-0x81
387 ldd -0x78(%r30), p032a1
388 ldd -0x70(%r30), p032a2
389 ldd -0x38(%r30), p096b1
390 ldd -0x30(%r30), p096b2
391 ldd -0x58(%r30), p160c1
392 ldd -0x50(%r30), p160c2
393 ldd -0x18(%r30), p224d1
394 ldd -0x10(%r30), p224d2
C NOTE(review): no filler follows this branch, so the first fstd of 8_or_more
C occupies its delay slot. It rewrites -0x18 with the %fr28 value already
C stored above, which looks harmless; confirm whether a nop was lost here.
395 b L(end1)
C At least two groups. Finish storing the products of the first group, then
C load and multiply the second group while the mid products of the first
C group are reloaded into integer registers. Each mid slot is read before it
C is overwritten with the new mid product.
398 LDEF(8_or_more)
399 fstd %fr28, -0x18(%r30) C mid product to -0x18..-0x11
400 fstd %fr29, -0x10(%r30) C mid product to -0x10..-0x09
401 ldo 32(up), up
402 fstd %fr30, -0x80(%r30) C low product to -0x80..-0x79
403 fstd %fr31, -0x68(%r30) C high product to -0x68..-0x61
404 fstd %fr22, -0x40(%r30) C low product to -0x40..-0x39
405 fstd %fr23, -0x28(%r30) C high product to -0x28..-0x21
406 fstd %fr24, -0x60(%r30) C low product to -0x60..-0x59
407 fstd %fr25, -0x48(%r30) C high product to -0x48..-0x41
408 fstd %fr26, -0x20(%r30) C low product to -0x20..-0x19
409 fstd %fr27, -0x88(%r30) C high product to -0x88..-0x81
410 fldd 0(up), %fr4
411 fldd 8(up), %fr5
412 fldd 16(up), %fr6
413 fldd 24(up), %fr7
414 xmpyu %fr8R, %fr4L, %fr22
415 ldd -0x78(%r30), p032a1
416 xmpyu %fr8L, %fr4R, %fr23
417 xmpyu %fr8R, %fr5L, %fr24
418 ldd -0x70(%r30), p032a2
419 xmpyu %fr8L, %fr5R, %fr25
420 xmpyu %fr8R, %fr6L, %fr26
421 ldd -0x38(%r30), p096b1
422 xmpyu %fr8L, %fr6R, %fr27
423 fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71
424 xmpyu %fr8R, %fr7L, %fr28
425 ldd -0x30(%r30), p096b2
426 xmpyu %fr8L, %fr7R, %fr29
427 fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69
428 xmpyu %fr8R, %fr4R, %fr30
429 ldd -0x58(%r30), p160c1
430 xmpyu %fr8L, %fr4L, %fr31
431 fstd %fr24, -0x38(%r30) C mid product to -0x38..-0x31
432 xmpyu %fr8R, %fr5R, %fr22
433 ldd -0x50(%r30), p160c2
434 xmpyu %fr8L, %fr5L, %fr23
435 fstd %fr25, -0x30(%r30) C mid product to -0x30..-0x29
436 xmpyu %fr8R, %fr6R, %fr24
437 ldd -0x18(%r30), p224d1
438 xmpyu %fr8L, %fr6L, %fr25
439 fstd %fr26, -0x58(%r30) C mid product to -0x58..-0x51
440 xmpyu %fr8R, %fr7R, %fr26
441 ldd -0x10(%r30), p224d2
442 fstd %fr27, -0x50(%r30) C mid product to -0x50..-0x49
C Decrement the group count. Zero means exactly two groups, so skip the loop.
C The xmpyu after the branch is needed on both paths.
443 addib,= -1, n, L(end2)
444 xmpyu %fr8L, %fr7L, %fr27
C Main loop, one group of four limbs per iteration. Three groups are in
C flight: the oldest is summed and added into rp, the products of the middle
C one are spilled from fp registers to the stack slots, and the newest is
C loaded from up and multiplied. Each slot is read before it is rewritten.
445 LDEF(loop)
C Sum the eight mid products of the oldest group with a carry chain, while
C reloading its low/high products and spilling the middle group.
446 add p032a1, p032a2, m032
447 ldd -0x80(%r30), p000a
448 add,dc p096b1, p096b2, m096
449 fstd %fr28, -0x18(%r30) C mid product to -0x18..-0x11
451 add,dc p160c1, p160c2, m160
452 ldd -0x68(%r30), p064a
453 add,dc p224d1, p224d2, m224
454 fstd %fr29, -0x10(%r30) C mid product to -0x10..-0x09
456 add,dc %r0, %r0, m288
457 ldd -0x40(%r30), p064b
458 ldo 32(up), up
459 fstd %fr30, -0x80(%r30) C low product to -0x80..-0x79
C Realign the mid sums by 32 bits: each ma word takes the upper half of one
C m word in its low half and the low half of the next m word in its upper half.
461 depd,z m032, 31, 32, ma000
462 ldd -0x28(%r30), p128b
463 extrd,u m032, 31, 32, ma064
464 fstd %fr31, -0x68(%r30) C high product to -0x68..-0x61
466 depd m096, 31, 32, ma064
467 ldd -0x60(%r30), p128c
468 extrd,u m096, 31, 32, ma128
469 fstd %fr22, -0x40(%r30) C low product to -0x40..-0x39
471 depd m160, 31, 32, ma128
472 ldd -0x48(%r30), p192c
473 extrd,u m160, 31, 32, ma192
474 fstd %fr23, -0x28(%r30) C high product to -0x28..-0x21
476 depd m224, 31, 32, ma192
477 ldd -0x20(%r30), p192d
478 extrd,u m224, 31, 32, ma256
479 fstd %fr24, -0x60(%r30) C low product to -0x60..-0x59
481 depd m288, 31, 32, ma256
482 ldd -0x88(%r30), p256d
C Carry in plus low/high products, chained across the four limbs. The top
C of the chain becomes the new carry limb.
483 add climb, p000a, s000
484 fstd %fr25, -0x48(%r30) C high product to -0x48..-0x41
486 add,dc p064a, p064b, s064
487 ldd 0(rp), r000
488 add,dc p128b, p128c, s128
489 fstd %fr26, -0x20(%r30) C low product to -0x20..-0x19
491 add,dc p192c, p192d, s192
492 ldd 8(rp), r064
493 add,dc p256d, %r0, climb
494 fstd %fr27, -0x88(%r30) C high product to -0x88..-0x81
496 ldd 16(rp), r128
497 add ma000, s000, s000 C accum mid 0
498 ldd 24(rp), r192
499 add,dc ma064, s064, s064 C accum mid 1
501 add,dc ma128, s128, s128 C accum mid 2
502 fldd 0(up), %fr4
503 add,dc ma192, s192, s192 C accum mid 3
504 fldd 8(up), %fr5
506 add,dc ma256, climb, climb
507 fldd 16(up), %fr6
508 add r000, s000, s000 C accum rlimb 0
509 fldd 24(up), %fr7
511 add,dc r064, s064, s064 C accum rlimb 1
512 add,dc r128, s128, s128 C accum rlimb 2
513 std s000, 0(rp)
515 add,dc r192, s192, s192 C accum rlimb 3
516 add,dc %r0, climb, climb
517 std s064, 8(rp)
C Multiply the newly loaded group while reloading the mid products of the
C middle group, which becomes the oldest group of the next iteration.
519 xmpyu %fr8R, %fr4L, %fr22
520 ldd -0x78(%r30), p032a1
521 xmpyu %fr8L, %fr4R, %fr23
522 std s128, 16(rp)
524 xmpyu %fr8R, %fr5L, %fr24
525 ldd -0x70(%r30), p032a2
526 xmpyu %fr8L, %fr5R, %fr25
527 std s192, 24(rp)
529 xmpyu %fr8R, %fr6L, %fr26
530 ldd -0x38(%r30), p096b1
531 xmpyu %fr8L, %fr6R, %fr27
532 fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71
534 xmpyu %fr8R, %fr7L, %fr28
535 ldd -0x30(%r30), p096b2
536 xmpyu %fr8L, %fr7R, %fr29
537 fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69
539 xmpyu %fr8R, %fr4R, %fr30
540 ldd -0x58(%r30), p160c1
541 xmpyu %fr8L, %fr4L, %fr31
542 fstd %fr24, -0x38(%r30) C mid product to -0x38..-0x31
544 xmpyu %fr8R, %fr5R, %fr22
545 ldd -0x50(%r30), p160c2
546 xmpyu %fr8L, %fr5L, %fr23
547 fstd %fr25, -0x30(%r30) C mid product to -0x30..-0x29
549 xmpyu %fr8R, %fr6R, %fr24
550 ldd -0x18(%r30), p224d1
551 xmpyu %fr8L, %fr6L, %fr25
552 fstd %fr26, -0x58(%r30) C mid product to -0x58..-0x51
554 xmpyu %fr8R, %fr7R, %fr26
555 ldd -0x10(%r30), p224d2
556 fstd %fr27, -0x50(%r30) C mid product to -0x50..-0x49
557 xmpyu %fr8L, %fr7L, %fr27
C Loop while groups remain. The ldo after the branch advances rp by four
C limbs on every pass, including the last one.
559 addib,<> -1, n, L(loop)
560 ldo 32(rp), rp
C Wind-down, stage 1. Same integer work as one loop iteration for the group
C whose products are in the stack slots, while the products of the final
C group are spilled from the fp registers. No further limbs are loaded from
C up and nothing more is multiplied.
562 LDEF(end2)
563 add p032a1, p032a2, m032
564 ldd -0x80(%r30), p000a
565 add,dc p096b1, p096b2, m096
566 fstd %fr28, -0x18(%r30) C mid product to -0x18..-0x11
567 add,dc p160c1, p160c2, m160
568 ldd -0x68(%r30), p064a
569 add,dc p224d1, p224d2, m224
570 fstd %fr29, -0x10(%r30) C mid product to -0x10..-0x09
571 add,dc %r0, %r0, m288
572 ldd -0x40(%r30), p064b
573 fstd %fr30, -0x80(%r30) C low product to -0x80..-0x79
574 depd,z m032, 31, 32, ma000
575 ldd -0x28(%r30), p128b
576 extrd,u m032, 31, 32, ma064
577 fstd %fr31, -0x68(%r30) C high product to -0x68..-0x61
578 depd m096, 31, 32, ma064
579 ldd -0x60(%r30), p128c
580 extrd,u m096, 31, 32, ma128
581 fstd %fr22, -0x40(%r30) C low product to -0x40..-0x39
582 depd m160, 31, 32, ma128
583 ldd -0x48(%r30), p192c
584 extrd,u m160, 31, 32, ma192
585 fstd %fr23, -0x28(%r30) C high product to -0x28..-0x21
586 depd m224, 31, 32, ma192
587 ldd -0x20(%r30), p192d
588 extrd,u m224, 31, 32, ma256
589 fstd %fr24, -0x60(%r30) C low product to -0x60..-0x59
590 depd m288, 31, 32, ma256
591 ldd -0x88(%r30), p256d
592 add climb, p000a, s000
593 fstd %fr25, -0x48(%r30) C high product to -0x48..-0x41
594 add,dc p064a, p064b, s064
595 ldd 0(rp), r000
596 add,dc p128b, p128c, s128
597 fstd %fr26, -0x20(%r30) C low product to -0x20..-0x19
598 add,dc p192c, p192d, s192
599 ldd 8(rp), r064
600 add,dc p256d, %r0, climb
601 fstd %fr27, -0x88(%r30) C high product to -0x88..-0x81
602 ldd 16(rp), r128
603 add ma000, s000, s000 C accum mid 0
604 ldd 24(rp), r192
605 add,dc ma064, s064, s064 C accum mid 1
606 add,dc ma128, s128, s128 C accum mid 2
607 add,dc ma192, s192, s192 C accum mid 3
608 add,dc ma256, climb, climb
609 add r000, s000, s000 C accum rlimb 0
610 add,dc r064, s064, s064 C accum rlimb 1
611 add,dc r128, s128, s128 C accum rlimb 2
612 std s000, 0(rp)
613 add,dc r192, s192, s192 C accum rlimb 3
614 add,dc %r0, climb, climb
615 std s064, 8(rp)
C Reload the mid products of the final group for end1, interleaved with the
C last two result stores, then advance rp by four limbs.
616 ldd -0x78(%r30), p032a1
617 std s128, 16(rp)
618 ldd -0x70(%r30), p032a2
619 std s192, 24(rp)
620 ldd -0x38(%r30), p096b1
621 ldd -0x30(%r30), p096b2
622 ldd -0x58(%r30), p160c1
623 ldd -0x50(%r30), p160c2
624 ldd -0x18(%r30), p224d1
625 ldd -0x10(%r30), p224d2
626 ldo 32(rp), rp
C Wind-down, stage 2. Sum and store the final group of four limbs. All of its
C products are in the stack slots and its mid products are already in the
C p032a1..p224d2 registers. Afterwards r6-r13 are reloaded from their slots.
628 LDEF(end1)
629 add p032a1, p032a2, m032
630 ldd -0x80(%r30), p000a
631 add,dc p096b1, p096b2, m096
632 add,dc p160c1, p160c2, m160
633 ldd -0x68(%r30), p064a
634 add,dc p224d1, p224d2, m224
635 add,dc %r0, %r0, m288
636 ldd -0x40(%r30), p064b
637 depd,z m032, 31, 32, ma000
638 ldd -0x28(%r30), p128b
639 extrd,u m032, 31, 32, ma064
640 depd m096, 31, 32, ma064
641 ldd -0x60(%r30), p128c
642 extrd,u m096, 31, 32, ma128
643 depd m160, 31, 32, ma128
644 ldd -0x48(%r30), p192c
645 extrd,u m160, 31, 32, ma192
646 depd m224, 31, 32, ma192
647 ldd -0x20(%r30), p192d
648 extrd,u m224, 31, 32, ma256
649 depd m288, 31, 32, ma256
650 ldd -0x88(%r30), p256d
651 add climb, p000a, s000
652 add,dc p064a, p064b, s064
653 ldd 0(rp), r000
654 add,dc p128b, p128c, s128
655 add,dc p192c, p192d, s192
656 ldd 8(rp), r064
657 add,dc p256d, %r0, climb
658 ldd 16(rp), r128
659 add ma000, s000, s000 C accum mid 0
660 ldd 24(rp), r192
661 add,dc ma064, s064, s064 C accum mid 1
662 add,dc ma128, s128, s128 C accum mid 2
663 add,dc ma192, s192, s192 C accum mid 3
664 add,dc ma256, climb, climb
665 add r000, s000, s000 C accum rlimb 0
666 add,dc r064, s064, s064 C accum rlimb 1
667 add,dc r128, s128, s128 C accum rlimb 2
668 std s000, 0(rp)
669 add,dc r192, s192, s192 C accum rlimb 3
670 add,dc %r0, climb, climb C climb now holds the final carry limb
671 std s064, 8(rp)
672 std s128, 16(rp)
673 std s192, 24(rp)
C Restore the registers that were saved on entry to BIG.
675 ldd -0xb0(%r30), %r13
676 ldd -0xb8(%r30), %r12
677 ldd -0xc0(%r30), %r11
678 ldd -0xc8(%r30), %r10
679 ldd -0xd0(%r30), %r9
680 ldd -0xd8(%r30), %r8
681 ldd -0xe0(%r30), %r7
682 ldd -0xe8(%r30), %r6
C Common exit. Also reached directly from the leftover code when no group of
C four remains, in which case only r3-r5 need restoring.
683 LDEF(done)
C Place the carry limb in the return register(s): whole in %r28 for 2.0w,
C otherwise upper half in %r28 and lower half in %r29. The terminating quote
C and parenthesis of this ifdef were missing and are restored.
684 ifdef(`HAVE_ABI_2_0w',
685 ` copy climb, %r28
686 ',` extrd,u climb, 63, 32, %r29
687 extrd,u climb, 31, 32, %r28
688 ')
689 ldd -0xf0(%r30), %r5
690 ldd -0xf8(%r30), %r4
C Return through %r2. The ldd,mb after the branch steps r30 back by 0x100 and
C reloads r3 from there, undoing the std,ma of the prologue.
691 bve (%r2)
692 ldd,mb -0x100(%r30), %r3
693 EPILOGUE(mpn_addmul_1)