beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / pa64 / mul_1.asm
blob6935c23ccd7cfb83080b63256cd674676f8a2d50
1 dnl HP-PA 2.0 64-bit mpn_mul_1 -- Multiply a limb vector with a limb and store
2 dnl the result in a second limb vector.
4 dnl Copyright 1998-2000, 2002, 2003 Free Software Foundation, Inc.
6 dnl This file is part of the GNU MP Library.
7 dnl
8 dnl The GNU MP Library is free software; you can redistribute it and/or modify
9 dnl it under the terms of either:
10 dnl
11 dnl * the GNU Lesser General Public License as published by the Free
12 dnl Software Foundation; either version 3 of the License, or (at your
13 dnl option) any later version.
14 dnl
15 dnl or
16 dnl
17 dnl * the GNU General Public License as published by the Free Software
18 dnl Foundation; either version 2 of the License, or (at your option) any
19 dnl later version.
20 dnl
21 dnl or both in parallel, as here.
22 dnl
23 dnl The GNU MP Library is distributed in the hope that it will be useful, but
24 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
25 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
26 dnl for more details.
27 dnl
28 dnl You should have received copies of the GNU General Public License and the
29 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
30 dnl see https://www.gnu.org/licenses/.
32 include(`../config.m4')
34 C cycles/limb
35 C 8000,8200: 6.5
36 C 8500,8600,8700: 5.625
38 C The feed-in and wind-down code has not yet been scheduled. Many cycles
39 C could be saved there per call.
41 C DESCRIPTION:
42 C The main loop "BIG" is 4-way unrolled, mainly to allow
43 C effective use of ADD,DC. Delays in moving data via the cache from the FP
44 C registers to the IU registers, have demanded a deep software pipeline, and
45 C a lot of stack slots for partial products in flight.
47 C CODE STRUCTURE:
48 C save-some-registers
49 C do 0, 1, 2, or 3 limbs
50 C if done, restore-some-regs and return
51 C save-many-regs
52 C do 4, 8, ... limbs
53 C restore-all-regs
55 C STACK LAYOUT:
56 C HP-PA stack grows upwards. We could allocate 8 fewer slots by using the
57 C slots marked FREE, as well as some slots in the caller's "frame marker".
59 C -00 <- r30
60 C -08 FREE
61 C -10 tmp
62 C -18 tmp
63 C -20 tmp
64 C -28 tmp
65 C -30 tmp
66 C -38 tmp
67 C -40 tmp
68 C -48 tmp
69 C -50 tmp
70 C -58 tmp
71 C -60 tmp
72 C -68 tmp
73 C -70 tmp
74 C -78 tmp
75 C -80 tmp
76 C -88 tmp
77 C -90 FREE
78 C -98 FREE
79 C -a0 FREE
80 C -a8 FREE
81 C -b0 r13
82 C -b8 r12
83 C -c0 r11
84 C -c8 r10
85 C -d0 r9
86 C -d8 r8
87 C -e0 r7
88 C -e8 r6
89 C -f0 r5
90 C -f8 r4
91 C -100 r3
92 C Previous frame:
93 C [unused area]
94 C -38/-138 vlimb home slot. For 2.0N, the vlimb arg will arrive here.
97 include(`../config.m4')
99 C INPUT PARAMETERS:
C Register names for the arguments.  The roles below are the ones the code
C in this file gives them.
define(`rp',`%r26')	C result pointer; product limbs are stored through it
define(`up',`%r25')	C source pointer; operand limbs are loaded through it
define(`n',`%r24')	C limb count
define(`vlimb',`%r23')	C the single multiplier limb

define(`climb',`%r23')	C carry limb; same register as vlimb

C Select the assembler level matching the ABI in use.
ifdef(`HAVE_ABI_2_0w',
`	.level	2.0w
',`	.level	2.0
')
PROLOGUE(mpn_mul_1)

C Entry: push a 0x100 byte frame, save r3-r5, clear the carry limb and get
C vlimb into %fr8.  Under the 2.0w ABI vlimb is first written to its "home"
C slot at -0x38; after the frame push that same slot is at -0x138, which is
C where the fldd below reads it.
ifdef(`HAVE_ABI_2_0w',
`	std		vlimb, -0x38(%r30)	C store vlimb into "home" slot
')
	std,ma		%r3, 0x100(%r30)	C save r3, then r30 += 0x100
	std		%r4, -0xf8(%r30)
	std		%r5, -0xf0(%r30)
	ldo		0(%r0), climb		C clear climb
	fldd		-0x138(%r30), %fr8	C put vlimb in fp register

C Register roles for the 1..3 limb feed-in code.  Each 64x64 product is built
C from four 32x32 xmpyu products that travel FP reg -> stack slot -> IU reg.
define(`p032a1',`%r1')	C first cross ("mid") product
define(`p032a2',`%r19')	C second cross ("mid") product

define(`m032',`%r20')	C sum of the two mid products
define(`m096',`%r21')	C carry out of that sum

define(`p000a',`%r22')	C low product
define(`p064a',`%r29')	C high product

define(`s000',`%r31')	C result limb being assembled

define(`ma000',`%r4')	C m032 << 32, the part added into the result limb
define(`ma064',`%r20')	C (m096 << 32) | (m032 >> 32), the part added into climb

C define(`r000',`%r3')	C FIXME don't save r3 for n < 4.

	extrd,u		n, 63, 2, %r5		C r5 = low two bits of n (n mod 4)
	cmpb,=		%r5, %r0, L(BIG)	C nothing left over: go to unrolled code
	nop					C branch delay slot

C First leftover limb: form its four partial products and park them on the
C stack.
	fldd		0(up), %fr4
	ldo		8(up), up
	xmpyu		%fr8R, %fr4L, %fr22
	xmpyu		%fr8L, %fr4R, %fr23
	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
	xmpyu		%fr8R, %fr4R, %fr24
	xmpyu		%fr8L, %fr4L, %fr25
	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
	fstd		%fr24, -0x80(%r30)	C low product to  -0x80..-0x79
	addib,<>	-1, %r5, L(two_or_more)	C more leftover limbs?
	fstd		%fr25, -0x68(%r30)	C high product to -0x68..-0x61
LDEF(one)
C Exactly one leftover limb: fetch its products and finish in 0_one_out.
	ldd		-0x78(%r30), p032a1
	ldd		-0x70(%r30), p032a2
	ldd		-0x80(%r30), p000a
	b		L(0_one_out)
	ldd		-0x68(%r30), p064a	C (delay slot)

LDEF(two_or_more)
C Second leftover limb: multiply it while reading back the first limb's
C products.  Each ldd precedes the fstd that reuses the same stack slot.
	fldd		0(up), %fr4
	ldo		8(up), up
	xmpyu		%fr8R, %fr4L, %fr22
	xmpyu		%fr8L, %fr4R, %fr23
	ldd		-0x78(%r30), p032a1
	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
	xmpyu		%fr8R, %fr4R, %fr24
	xmpyu		%fr8L, %fr4L, %fr25
	ldd		-0x70(%r30), p032a2
	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
	ldd		-0x80(%r30), p000a
	fstd		%fr24, -0x80(%r30)	C low product to  -0x80..-0x79
	ldd		-0x68(%r30), p064a
	addib,<>	-1, %r5, L(three_or_more)
	fstd		%fr25, -0x68(%r30)	C high product to -0x68..-0x61
LDEF(two)
C Exactly two leftover limbs: combine the first limb's mid products and
C continue in 0_two_out.
	add		p032a1, p032a2, m032
	add,dc		%r0, %r0, m096
	depd,z		m032, 31, 32, ma000
	extrd,u		m032, 31, 32, ma064
	b		L(0_two_out)
	depd		m096, 31, 32, ma064	C (delay slot)

LDEF(three_or_more)
C Three leftover limbs: start loading the third limb and combine the first
C limb's mid products, then fall through to 0_out.
	fldd		0(up), %fr4
	add		p032a1, p032a2, m032
	add,dc		%r0, %r0, m096
	depd,z		m032, 31, 32, ma000
	extrd,u		m032, 31, 32, ma064
C	addib,=		-1, %r5, L(0_out)
	depd		m096, 31, 32, ma064
LDEF(loop0)
C The body of this loop is disabled; control simply falls through to 0_out.
C	xmpyu		%fr8R, %fr4L, %fr22
C	xmpyu		%fr8L, %fr4R, %fr23
C	ldd		-0x78(%r30), p032a1
C	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
C
C	xmpyu		%fr8R, %fr4R, %fr24
C	xmpyu		%fr8L, %fr4L, %fr25
C	ldd		-0x70(%r30), p032a2
C	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
C
C	ldo		8(rp), rp
C	add		climb, p000a, s000
C	ldd		-0x80(%r30), p000a
C	fstd		%fr24, -0x80(%r30)	C low product to  -0x80..-0x79
C
C	add,dc		p064a, %r0, climb
C	ldo		8(up), up
C	ldd		-0x68(%r30), p064a
C	fstd		%fr25, -0x68(%r30)	C high product to -0x68..-0x61
C
C	add		ma000, s000, s000
C	add,dc		ma064, climb, climb
C	fldd		0(up), %fr4
C
C	std		s000, -8(rp)
C
C	add		p032a1, p032a2, m032
C	add,dc		%r0, %r0, m096
C
C	depd,z		m032, 31, 32, ma000
C	extrd,u		m032, 31, 32, ma064
C	addib,<>	-1, %r5, L(loop0)
C	depd		m096, 31, 32, ma064
LDEF(0_out)
C Multiply the last leftover limb, store the result limb for the oldest limb
C in flight, and combine the mid products of the next one.
	ldo		8(up), up
	xmpyu		%fr8R, %fr4L, %fr22
	xmpyu		%fr8L, %fr4R, %fr23
	ldd		-0x78(%r30), p032a1
	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
	xmpyu		%fr8R, %fr4R, %fr24
	xmpyu		%fr8L, %fr4L, %fr25
	ldd		-0x70(%r30), p032a2
	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
	ldo		8(rp), rp
	add		climb, p000a, s000
	ldd		-0x80(%r30), p000a
	fstd		%fr24, -0x80(%r30)	C low product to  -0x80..-0x79
	add,dc		p064a, %r0, climb
	ldd		-0x68(%r30), p064a
	fstd		%fr25, -0x68(%r30)	C high product to -0x68..-0x61
	add		ma000, s000, s000
	add,dc		ma064, climb, climb
	std		s000, -8(rp)		C rp was already advanced above
	add		p032a1, p032a2, m032
	add,dc		%r0, %r0, m096
	depd,z		m032, 31, 32, ma000
	extrd,u		m032, 31, 32, ma064
	depd		m096, 31, 32, ma064
LDEF(0_two_out)
C Two limbs still in flight: store the result limb for the older one and fetch
C the products of the newer one.
	ldd		-0x78(%r30), p032a1
	ldd		-0x70(%r30), p032a2
	ldo		8(rp), rp
	add		climb, p000a, s000
	ldd		-0x80(%r30), p000a
	add,dc		p064a, %r0, climb
	ldd		-0x68(%r30), p064a
	add		ma000, s000, s000
	add,dc		ma064, climb, climb
	std		s000, -8(rp)
LDEF(0_one_out)
C One limb still in flight: combine its products with climb and store it.
	add		p032a1, p032a2, m032
	add,dc		%r0, %r0, m096
	depd,z		m032, 31, 32, ma000
	extrd,u		m032, 31, 32, ma064
	depd		m096, 31, 32, ma064

	add		climb, p000a, s000
	add,dc		p064a, %r0, climb
	add		ma000, s000, s000
	add,dc		ma064, climb, climb
	std		s000, 0(rp)

C This point is only reached when n mod 4 != 0, so 4 >= n means every limb
C has been handled; otherwise fall through to the 4-way unrolled code.
	cmpib,>=	4, n, L(done)
	ldo		8(rp), rp		C (delay slot) step past the limb just stored
C 4-way unrolled code.

LDEF(BIG)

C Register roles for the unrolled code.  The letters a..d name the four limbs
C of a group; the number is the bit weight of the value within the group.
C Several names share a register because the values are live at different
C points of the schedule.
define(`p032a1',`%r1')	C mid products, limb a
define(`p032a2',`%r19')	C
define(`p096b1',`%r20')	C mid products, limb b
define(`p096b2',`%r21')	C
define(`p160c1',`%r22')	C mid products, limb c
define(`p160c2',`%r29')	C
define(`p224d1',`%r31')	C mid products, limb d
define(`p224d2',`%r3')	C

define(`m032',`%r4')	C sums of the mid product pairs, chained with add,dc
define(`m096',`%r5')	C
define(`m160',`%r6')	C
define(`m224',`%r7')	C
define(`m288',`%r8')	C carry out of the last mid sum

define(`p000a',`%r1')	C low product, limb a
define(`p064a',`%r19')	C high product, limb a
define(`p064b',`%r20')	C low product, limb b
define(`p128b',`%r21')	C high product, limb b
define(`p128c',`%r22')	C low product, limb c
define(`p192c',`%r29')	C high product, limb c
define(`p192d',`%r31')	C low product, limb d
define(`p256d',`%r3')	C high product, limb d

define(`s000',`%r10')	C the four result limbs of a group
define(`s064',`%r11')	C
define(`s128',`%r12')	C
define(`s192',`%r13')	C

define(`ma000',`%r9')	C mid sums re-aligned to 64-bit limb boundaries
define(`ma064',`%r4')	C
define(`ma128',`%r5')	C
define(`ma192',`%r6')	C
define(`ma256',`%r7')	C

C Save r6-r13, which the unrolled code uses (r3-r5 were saved at entry).
	std		%r6, -0xe8(%r30)
	std		%r7, -0xe0(%r30)
	std		%r8, -0xd8(%r30)
	std		%r9, -0xd0(%r30)
	std		%r10, -0xc8(%r30)
	std		%r11, -0xc0(%r30)
	std		%r12, -0xb8(%r30)
	std		%r13, -0xb0(%r30)

C n = n / 4: from here on n counts groups of four limbs.
ifdef(`HAVE_ABI_2_0w',
`	extrd,u		n, 61, 62, n		C right shift 2
',`	extrd,u		n, 61, 30, n		C right shift 2, zero extend
')

LDEF(4_or_more)
C Load the first group of four limbs and form all sixteen partial products.
C %fr22-%fr27 are each used twice, so a mid product is stored to the stack
C before the low/high product that reuses its register is computed.
	fldd		0(up), %fr4
	fldd		8(up), %fr5
	fldd		16(up), %fr6
	fldd		24(up), %fr7
	xmpyu		%fr8R, %fr4L, %fr22
	xmpyu		%fr8L, %fr4R, %fr23
	xmpyu		%fr8R, %fr5L, %fr24
	xmpyu		%fr8L, %fr5R, %fr25
	xmpyu		%fr8R, %fr6L, %fr26
	xmpyu		%fr8L, %fr6R, %fr27
	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
	xmpyu		%fr8R, %fr7L, %fr28
	xmpyu		%fr8L, %fr7R, %fr29
	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
	xmpyu		%fr8R, %fr4R, %fr30
	xmpyu		%fr8L, %fr4L, %fr31
	fstd		%fr24, -0x38(%r30)	C mid product to  -0x38..-0x31
	xmpyu		%fr8R, %fr5R, %fr22
	xmpyu		%fr8L, %fr5L, %fr23
	fstd		%fr25, -0x30(%r30)	C mid product to  -0x30..-0x29
	xmpyu		%fr8R, %fr6R, %fr24
	xmpyu		%fr8L, %fr6L, %fr25
	fstd		%fr26, -0x58(%r30)	C mid product to  -0x58..-0x51
	xmpyu		%fr8R, %fr7R, %fr26
	fstd		%fr27, -0x50(%r30)	C mid product to  -0x50..-0x49
	addib,<>	-1, n, L(8_or_more)	C at least one more group follows?
	xmpyu		%fr8L, %fr7L, %fr27	C (delay slot)
C Only one group: flush the remaining products, fetch the mid products into
C integer registers and finish in end1.
	fstd		%fr28, -0x18(%r30)	C mid product to  -0x18..-0x11
	fstd		%fr29, -0x10(%r30)	C mid product to  -0x10..-0x09
	fstd		%fr30, -0x80(%r30)	C low product to  -0x80..-0x79
	fstd		%fr31, -0x68(%r30)	C high product to -0x68..-0x61
	fstd		%fr22, -0x40(%r30)	C low product to  -0x40..-0x39
	fstd		%fr23, -0x28(%r30)	C high product to -0x28..-0x21
	fstd		%fr24, -0x60(%r30)	C low product to  -0x60..-0x59
	fstd		%fr25, -0x48(%r30)	C high product to -0x48..-0x41
	fstd		%fr26, -0x20(%r30)	C low product to  -0x20..-0x19
	fstd		%fr27, -0x88(%r30)	C high product to -0x88..-0x81
	ldd		-0x78(%r30), p032a1
	ldd		-0x70(%r30), p032a2
	ldd		-0x38(%r30), p096b1
	ldd		-0x30(%r30), p096b2
	ldd		-0x58(%r30), p160c1
	ldd		-0x50(%r30), p160c2
	ldd		-0x18(%r30), p224d1
	ldd		-0x10(%r30), p224d2
	b		L(end1)
	nop					C branch delay slot
LDEF(8_or_more)
C At least two groups.  Flush the rest of the first group's products to the
C stack, then multiply the second group while reading the first group's mid
C products into integer registers.  Each ldd of a slot comes before the fstd
C that overwrites that slot with the second group's value.
	fstd		%fr28, -0x18(%r30)	C mid product to  -0x18..-0x11
	fstd		%fr29, -0x10(%r30)	C mid product to  -0x10..-0x09
	ldo		32(up), up		C advance to the next four limbs
	fstd		%fr30, -0x80(%r30)	C low product to  -0x80..-0x79
	fstd		%fr31, -0x68(%r30)	C high product to -0x68..-0x61
	fstd		%fr22, -0x40(%r30)	C low product to  -0x40..-0x39
	fstd		%fr23, -0x28(%r30)	C high product to -0x28..-0x21
	fstd		%fr24, -0x60(%r30)	C low product to  -0x60..-0x59
	fstd		%fr25, -0x48(%r30)	C high product to -0x48..-0x41
	fstd		%fr26, -0x20(%r30)	C low product to  -0x20..-0x19
	fstd		%fr27, -0x88(%r30)	C high product to -0x88..-0x81
	fldd		0(up), %fr4
	fldd		8(up), %fr5
	fldd		16(up), %fr6
	fldd		24(up), %fr7
	xmpyu		%fr8R, %fr4L, %fr22
	ldd		-0x78(%r30), p032a1
	xmpyu		%fr8L, %fr4R, %fr23
	xmpyu		%fr8R, %fr5L, %fr24
	ldd		-0x70(%r30), p032a2
	xmpyu		%fr8L, %fr5R, %fr25
	xmpyu		%fr8R, %fr6L, %fr26
	ldd		-0x38(%r30), p096b1
	xmpyu		%fr8L, %fr6R, %fr27
	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
	xmpyu		%fr8R, %fr7L, %fr28
	ldd		-0x30(%r30), p096b2
	xmpyu		%fr8L, %fr7R, %fr29
	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
	xmpyu		%fr8R, %fr4R, %fr30
	ldd		-0x58(%r30), p160c1
	xmpyu		%fr8L, %fr4L, %fr31
	fstd		%fr24, -0x38(%r30)	C mid product to  -0x38..-0x31
	xmpyu		%fr8R, %fr5R, %fr22
	ldd		-0x50(%r30), p160c2
	xmpyu		%fr8L, %fr5L, %fr23
	fstd		%fr25, -0x30(%r30)	C mid product to  -0x30..-0x29
	xmpyu		%fr8R, %fr6R, %fr24
	ldd		-0x18(%r30), p224d1
	xmpyu		%fr8L, %fr6L, %fr25
	fstd		%fr26, -0x58(%r30)	C mid product to  -0x58..-0x51
	xmpyu		%fr8R, %fr7R, %fr26
	ldd		-0x10(%r30), p224d2
	fstd		%fr27, -0x50(%r30)	C mid product to  -0x50..-0x49
	addib,=		-1, n, L(end2)		C exactly two groups: skip the loop
	xmpyu		%fr8L, %fr7L, %fr27	C (delay slot)
LDEF(loop)
C Steady state of the software pipeline.  Each pass
C   - sums the mid products of the group already held in integer registers,
C   - flushes the low/high/remaining mid products of the following group from
C     FP registers to the stack (each slot is read before it is rewritten),
C   - forms four result limbs with one add / add,dc carry chain and stores
C     them through rp,
C   - loads the next four limbs from up and starts their sixteen xmpyu.
C The instruction order is the schedule; do not reorder.
	add		p032a1, p032a2, m032
	ldd		-0x80(%r30), p000a
	add,dc		p096b1, p096b2, m096
	fstd		%fr28, -0x18(%r30)	C mid product to  -0x18..-0x11

	add,dc		p160c1, p160c2, m160
	ldd		-0x68(%r30), p064a
	add,dc		p224d1, p224d2, m224
	fstd		%fr29, -0x10(%r30)	C mid product to  -0x10..-0x09

	add,dc		%r0, %r0, m288
	ldd		-0x40(%r30), p064b
	ldo		32(up), up
	fstd		%fr30, -0x80(%r30)	C low product to  -0x80..-0x79

C Re-align the mid sums (weights 32, 96, ...) to 64-bit limb boundaries.
	depd,z		m032, 31, 32, ma000
	ldd		-0x28(%r30), p128b
	extrd,u		m032, 31, 32, ma064
	fstd		%fr31, -0x68(%r30)	C high product to -0x68..-0x61

	depd		m096, 31, 32, ma064
	ldd		-0x60(%r30), p128c
	extrd,u		m096, 31, 32, ma128
	fstd		%fr22, -0x40(%r30)	C low product to  -0x40..-0x39

	depd		m160, 31, 32, ma128
	ldd		-0x48(%r30), p192c
	extrd,u		m160, 31, 32, ma192
	fstd		%fr23, -0x28(%r30)	C high product to -0x28..-0x21

	depd		m224, 31, 32, ma192
	ldd		-0x20(%r30), p192d
	extrd,u		m224, 31, 32, ma256
	fstd		%fr24, -0x60(%r30)	C low product to  -0x60..-0x59

	depd		m288, 31, 32, ma256
	ldd		-0x88(%r30), p256d
	add		climb, p000a, s000	C start of the low/high carry chain
	fstd		%fr25, -0x48(%r30)	C high product to -0x48..-0x41

	add,dc		p064a, p064b, s064
	add,dc		p128b, p128c, s128
	fstd		%fr26, -0x20(%r30)	C low product to  -0x20..-0x19

	add,dc		p192c, p192d, s192
	add,dc		p256d, %r0, climb
	fstd		%fr27, -0x88(%r30)	C high product to -0x88..-0x81

	add		ma000, s000, s000	C accum mid 0
	fldd		0(up), %fr4
	add,dc		ma064, s064, s064	C accum mid 1
	std		s000, 0(rp)

	add,dc		ma128, s128, s128	C accum mid 2
	fldd		8(up), %fr5
	add,dc		ma192, s192, s192	C accum mid 3
	std		s064, 8(rp)

	add,dc		ma256, climb, climb
	fldd		16(up), %fr6
	std		s128, 16(rp)

	xmpyu		%fr8R, %fr4L, %fr22
	ldd		-0x78(%r30), p032a1
	xmpyu		%fr8L, %fr4R, %fr23
	fldd		24(up), %fr7

	xmpyu		%fr8R, %fr5L, %fr24
	ldd		-0x70(%r30), p032a2
	xmpyu		%fr8L, %fr5R, %fr25
	std		s192, 24(rp)

	xmpyu		%fr8R, %fr6L, %fr26
	ldd		-0x38(%r30), p096b1
	xmpyu		%fr8L, %fr6R, %fr27
	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71

	xmpyu		%fr8R, %fr7L, %fr28
	ldd		-0x30(%r30), p096b2
	xmpyu		%fr8L, %fr7R, %fr29
	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69

	xmpyu		%fr8R, %fr4R, %fr30
	ldd		-0x58(%r30), p160c1
	xmpyu		%fr8L, %fr4L, %fr31
	fstd		%fr24, -0x38(%r30)	C mid product to  -0x38..-0x31

	xmpyu		%fr8R, %fr5R, %fr22
	ldd		-0x50(%r30), p160c2
	xmpyu		%fr8L, %fr5L, %fr23
	fstd		%fr25, -0x30(%r30)	C mid product to  -0x30..-0x29

	xmpyu		%fr8R, %fr6R, %fr24
	ldd		-0x18(%r30), p224d1
	xmpyu		%fr8L, %fr6L, %fr25
	fstd		%fr26, -0x58(%r30)	C mid product to  -0x58..-0x51

	xmpyu		%fr8R, %fr7R, %fr26
	ldd		-0x10(%r30), p224d2
	fstd		%fr27, -0x50(%r30)	C mid product to  -0x50..-0x49
	xmpyu		%fr8L, %fr7L, %fr27

	addib,<>	-1, n, L(loop)		C one group fewer; loop while n != 0
	ldo		32(rp), rp		C (delay slot) advance rp by four limbs
LDEF(end2)
C Wind-down with two groups in flight and no further limbs to load.  This does
C the same arithmetic as a loop pass for the older group (sum its mid
C products, flush the newer group's products to the stack, store four result
C limbs), then fetches the newer group's mid products for end1.
	add		p032a1, p032a2, m032
	ldd		-0x80(%r30), p000a
	add,dc		p096b1, p096b2, m096
	fstd		%fr28, -0x18(%r30)	C mid product to  -0x18..-0x11
	add,dc		p160c1, p160c2, m160
	ldd		-0x68(%r30), p064a
	add,dc		p224d1, p224d2, m224
	fstd		%fr29, -0x10(%r30)	C mid product to  -0x10..-0x09
	add,dc		%r0, %r0, m288
	ldd		-0x40(%r30), p064b
	fstd		%fr30, -0x80(%r30)	C low product to  -0x80..-0x79
	depd,z		m032, 31, 32, ma000
	ldd		-0x28(%r30), p128b
	extrd,u		m032, 31, 32, ma064
	fstd		%fr31, -0x68(%r30)	C high product to -0x68..-0x61
	depd		m096, 31, 32, ma064
	ldd		-0x60(%r30), p128c
	extrd,u		m096, 31, 32, ma128
	fstd		%fr22, -0x40(%r30)	C low product to  -0x40..-0x39
	depd		m160, 31, 32, ma128
	ldd		-0x48(%r30), p192c
	extrd,u		m160, 31, 32, ma192
	fstd		%fr23, -0x28(%r30)	C high product to -0x28..-0x21
	depd		m224, 31, 32, ma192
	ldd		-0x20(%r30), p192d
	extrd,u		m224, 31, 32, ma256
	fstd		%fr24, -0x60(%r30)	C low product to  -0x60..-0x59
	depd		m288, 31, 32, ma256
	ldd		-0x88(%r30), p256d
	add		climb, p000a, s000
	fstd		%fr25, -0x48(%r30)	C high product to -0x48..-0x41
	add,dc		p064a, p064b, s064
	add,dc		p128b, p128c, s128
	fstd		%fr26, -0x20(%r30)	C low product to  -0x20..-0x19
	add,dc		p192c, p192d, s192
	add,dc		p256d, %r0, climb
	fstd		%fr27, -0x88(%r30)	C high product to -0x88..-0x81
	add		ma000, s000, s000	C accum mid 0
	add,dc		ma064, s064, s064	C accum mid 1
	add,dc		ma128, s128, s128	C accum mid 2
	add,dc		ma192, s192, s192	C accum mid 3
	add,dc		ma256, climb, climb
	std		s000, 0(rp)
	std		s064, 8(rp)
	ldd		-0x78(%r30), p032a1
	std		s128, 16(rp)
	ldd		-0x70(%r30), p032a2
	std		s192, 24(rp)
	ldd		-0x38(%r30), p096b1
	ldd		-0x30(%r30), p096b2
	ldd		-0x58(%r30), p160c1
	ldd		-0x50(%r30), p160c2
	ldd		-0x18(%r30), p224d1
	ldd		-0x10(%r30), p224d2
	ldo		32(rp), rp		C advance rp by four limbs
LDEF(end1)
C Wind-down with one group in flight.  Its mid products are already in integer
C registers and its low/high products are on the stack; combine them with
C climb and store the last four result limbs.
	add		p032a1, p032a2, m032
	ldd		-0x80(%r30), p000a
	add,dc		p096b1, p096b2, m096
	add,dc		p160c1, p160c2, m160
	ldd		-0x68(%r30), p064a
	add,dc		p224d1, p224d2, m224
	add,dc		%r0, %r0, m288
	ldd		-0x40(%r30), p064b
	depd,z		m032, 31, 32, ma000
	ldd		-0x28(%r30), p128b
	extrd,u		m032, 31, 32, ma064
	depd		m096, 31, 32, ma064
	ldd		-0x60(%r30), p128c
	extrd,u		m096, 31, 32, ma128
	depd		m160, 31, 32, ma128
	ldd		-0x48(%r30), p192c
	extrd,u		m160, 31, 32, ma192
	depd		m224, 31, 32, ma192
	ldd		-0x20(%r30), p192d
	extrd,u		m224, 31, 32, ma256
	depd		m288, 31, 32, ma256
	ldd		-0x88(%r30), p256d
	add		climb, p000a, s000
	add,dc		p064a, p064b, s064
	add,dc		p128b, p128c, s128
	add,dc		p192c, p192d, s192
	add,dc		p256d, %r0, climb
	add		ma000, s000, s000	C accum mid 0
	add,dc		ma064, s064, s064	C accum mid 1
	add,dc		ma128, s128, s128	C accum mid 2
	add,dc		ma192, s192, s192	C accum mid 3
	add,dc		ma256, climb, climb
	std		s000, 0(rp)
	std		s064, 8(rp)
	std		s128, 16(rp)
	std		s192, 24(rp)

C Restore the registers saved at BIG.
	ldd		-0xb0(%r30), %r13
	ldd		-0xb8(%r30), %r12
	ldd		-0xc0(%r30), %r11
	ldd		-0xc8(%r30), %r10
	ldd		-0xd0(%r30), %r9
	ldd		-0xd8(%r30), %r8
	ldd		-0xe0(%r30), %r7
	ldd		-0xe8(%r30), %r6
LDEF(done)
C Common exit, also reached directly from the 1..3 limb code (which never
C saved r6-r13).  The final carry limb is handed back in %r28 under 2.0w;
C otherwise it is split, high 32 bits in %r28 and low 32 bits in %r29.
ifdef(`HAVE_ABI_2_0w',
`	copy		climb, %r28
',`	extrd,u		climb, 63, 32, %r29
	extrd,u		climb, 31, 32, %r28
')
	ldd		-0xf0(%r30), %r5
	ldd		-0xf8(%r30), %r4
	bve		(%r2)			C return
	ldd,mb		-0x100(%r30), %r3	C (delay slot) pop the frame and restore r3
EPILOGUE(mpn_mul_1)