dnl  PowerPC-64 mpn_sqr_basecase.

dnl  Contributed to the GNU project by Torbjorn Granlund.

dnl  Copyright 1999-2001, 2003-2006, 2008, 2010, 2011 Free Software Foundation,
dnl  Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C                   cycles/limb
C POWER3/PPC630         6-18
C POWER4/PPC970          8
C POWER5                 8
C POWER6                16.25
C POWER7                 3.77

C NOTES
C  * This is very crude, cleanup!
C  * Try to reduce the number of needed live registers.
C  * Rewrite for POWER6 to use 8 consecutive muls, not 2 groups of 4.  The
C    cost will be more live registers.
C  * Rewrite for POWER7 to use addmul_2 building blocks; this will reduce code
C    size a lot and speed things up perhaps 25%.
C  * Use computed goto in order to compress the code.
C  * Implement a larger final corner.
C  * Schedule callee-saves register saves into other insns.  This could save
C    about 5 cycles/call.  (We cannot analogously optimise the restores, since
C    the sqr_diag_addlsh1 loop has no wind-down code as currently written.)
C  * Should the alternating std/adde sequences be split?  Some pipelines handle
C    adde poorly, and might sequentialise all these instructions.
C  * The sqr_diag_addlsh1 loop was written for POWER6 and its preferences for
C    adjacent integer multiply insns.  Except for the multiply insns, the code
C    was not carefully optimised for POWER6 or any other CPU.
C  * Perform cross-jumping in sqr_diag_addlsh1's feed-in code, into the loop.
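
C METHOD
C  The code makes two passes over the operand.  The first pass (the
C  mul_1/addmul_1 code down to L(outer_end)) accumulates the off-diagonal
C  products u[i]*u[j], i < j, into rp[1..2n-2].  The second pass (the
C  sqr_diag_addlsh1 code from the L(xb*) feed-ins onwards) doubles that sum
C  and adds in the diagonal squares u[i]^2.  A minimal C model of the same
C  structure, assuming 64-bit limbs and a compiler providing unsigned
C  __int128 (the function and type names are illustrative only):
C
C    typedef unsigned long long limb;
C
C    void ref_sqr_basecase (limb *rp, const limb *up, int n)
C    {
C      unsigned __int128 t;
C      for (int i = 0; i < 2 * n; i++)
C        rp[i] = 0;
C      /* pass 1: off-diagonal products u[i]*u[j] for i < j */
C      for (int i = 0; i < n - 1; i++)
C        {
C          limb cy = 0;
C          for (int j = i + 1; j < n; j++)
C            {
C              t = (unsigned __int128) up[i] * up[j] + rp[i + j] + cy;
C              rp[i + j] = (limb) t;
C              cy = (limb) (t >> 64);
C            }
C          rp[i + n] = cy;
C        }
C      /* pass 2: rp = 2*rp + sum of up[i]^2 * B^(2i), with B = 2^64 */
C      limb cy = 0;
C      for (int i = 0; i < n; i++)
C        {
C          t = (unsigned __int128) up[i] * up[i];
C          unsigned __int128 lo, hi;
C          lo = ((unsigned __int128) rp[2*i] << 1) + (limb) t + cy;
C          hi = ((unsigned __int128) rp[2*i+1] << 1) + (limb) (t >> 64)
C               + (limb) (lo >> 64);
C          rp[2*i]   = (limb) lo;
C          rp[2*i+1] = (limb) hi;
C          cy = (limb) (hi >> 64);
C        }
C    }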

C INPUT PARAMETERS
define(`rp', `r3')
define(`up', `r4')
define(`n',  `r5')

define(`rp_outer', `r25')
define(`up_outer', `r21')
define(`rp_saved', `r22')
define(`up_saved', `r23')
define(`n_saved',  `r24')
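
C rp_outer/up_outer track the start of each outer (addmul_1) pass, while
C rp_saved/up_saved/n_saved preserve the original arguments for the final
C sqr_diag_addlsh1 pass over the whole result.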

ASM_START()
PROLOGUE(mpn_sqr_basecase)
        cmpdi   cr0, n, 2
        bge     cr0, L(ge2)
        ld      r5, 0(up)       C n = 1
        nop
        mulld   r8, r5, r5      C weight 0
        mulhdu  r9, r5, r5      C weight 1
        std     r8, 0(rp)
        std     r9, 8(rp)
        blr
        ALIGN(16)
L(ge2): bgt     cr0, L(gt2)
        ld      r0, 0(up)       C n = 2
        nop
        mulld   r8, r0, r0      C u0 * u0
        mulhdu  r9, r0, r0      C u0 * u0
        ld      r6, 8(up)
        mulld   r10, r6, r6     C u1 * u1
        mulhdu  r11, r6, r6     C u1 * u1
        mulld   r4, r6, r0      C u1 * u0
        mulhdu  r5, r6, r0      C u1 * u0
        addc    r4, r4, r4
        adde    r5, r5, r5
        addze   r11, r11
        addc    r9, r9, r4
        adde    r10, r10, r5
        addze   r11, r11
        std     r8, 0(rp)
        std     r9, 8(rp)
        std     r10, 16(rp)
        std     r11, 24(rp)
        blr

        ALIGN(16)
L(gt2): std     r31, -8(r1)
        std     r30, -16(r1)
        std     r29, -24(r1)
        std     r28, -32(r1)
        std     r27, -40(r1)
        std     r26, -48(r1)
        std     r25, -56(r1)
        std     r24, -64(r1)
        std     r23, -72(r1)
        std     r22, -80(r1)
        std     r21, -88(r1)

        mr      rp_saved, rp
        mr      up_saved, up
        mr      n_saved, n
        mr      rp_outer, rp
        mr      up_outer, up
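
C Dispatch on n mod 4 so the feed-in code leaves the first (mul_1) pass
C aligned with the 4-way unrolled inner loops below.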
        rldicl. r0, n, 0,62     C r0 = n & 3, set cr0
        cmpdi   cr6, r0, 2
        addic   r7, n, 2        C compute count...
        srdi    r7, r7, 2       C ...for ctr
        mtctr   r7              C copy count into ctr
        beq-    cr0, L(b0)
        blt-    cr6, L(b1)
        beq-    cr6, L(b2)

L(b3):  ld      r6, 0(up)
        ld      r9, 8(up)
        ld      r27, 16(up)
        addi    up, up, 24
        li      r12, 0          C carry limb
        bdz     L(em3)
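
C Inner mul_1 loop: each iteration multiplies four limbs of up by the outer
C limb r6 and stores four product limbs, carrying between products in r12.
C L(tm0), L(tm1) and L(tm2) below are identical copies reached from the
C other n mod 4 feed-ins.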
        ALIGN(16)
L(tm3): mulld   r0, r9, r6
        mulhdu  r26, r9, r6
        mulld   r7, r27, r6
        mulhdu  r8, r27, r6
        ld      r9, 0(up)
        ld      r27, 8(up)
        adde    r0, r0, r12
        adde    r7, r7, r26
        mulld   r26, r9, r6
        mulhdu  r10, r9, r6
        mulld   r11, r27, r6
        mulhdu  r12, r27, r6
        ld      r9, 16(up)
        ld      r27, 24(up)
        std     r0, 8(rp)
        adde    r26, r26, r8
        std     r7, 16(rp)
        adde    r11, r11, r10
        std     r26, 24(rp)
        addi    up, up, 32
        std     r11, 32(rp)
        addi    rp, rp, 32
        bdnz    L(tm3)

L(em3): mulld   r0, r9, r6
        mulhdu  r26, r9, r6
        mulld   r7, r27, r6
        mulhdu  r8, r27, r6
        adde    r0, r0, r12
        adde    r7, r7, r26
        std     r0, 8(rp)
        std     r7, 16(rp)
        addze   r8, r8
        std     r8, 24(rp)
        addi    n, n, 2
        b       L(outer_loop)

L(b0):  ld      r6, 0(up)
        ld      r27, 8(up)
        mulld   r7, r27, r6
        mulhdu  r12, r27, r6
        std     r7, 8(rp)
        addi    rp, rp, 8
        ld      r9, 16(up)
        ld      r27, 24(up)
        addi    up, up, 32
        bdz     L(em0)

        ALIGN(16)
L(tm0): mulld   r0, r9, r6
        mulhdu  r26, r9, r6
        mulld   r7, r27, r6
        mulhdu  r8, r27, r6
        ld      r9, 0(up)
        ld      r27, 8(up)
        adde    r0, r0, r12
        adde    r7, r7, r26
        mulld   r26, r9, r6
        mulhdu  r10, r9, r6
        mulld   r11, r27, r6
        mulhdu  r12, r27, r6
        ld      r9, 16(up)
        ld      r27, 24(up)
        std     r0, 8(rp)
        adde    r26, r26, r8
        std     r7, 16(rp)
        adde    r11, r11, r10
        std     r26, 24(rp)
        addi    up, up, 32
        std     r11, 32(rp)
        addi    rp, rp, 32
        bdnz    L(tm0)

L(em0): mulld   r0, r9, r6
        mulhdu  r26, r9, r6
        mulld   r7, r27, r6
        mulhdu  r8, r27, r6
        adde    r0, r0, r12
        adde    r7, r7, r26
        std     r0, 8(rp)
        std     r7, 16(rp)
        addze   r8, r8
        std     r8, 24(rp)
        addi    n, n, 2
        b       L(outer_loop_ent_2)

L(b1):  ld      r6, 0(up)
        ld      r9, 8(up)
        ld      r27, 16(up)
        mulld   r0, r9, r6
        mulhdu  r26, r9, r6
        mulld   r7, r27, r6
        mulhdu  r12, r27, r6
        addc    r7, r7, r26
        std     r0, 8(rp)
        std     r7, 16(rp)
        addi    rp, rp, 16
        ld      r9, 24(up)
        ld      r27, 32(up)
        addi    up, up, 40
        bdz     L(em1)

        ALIGN(16)
L(tm1): mulld   r0, r9, r6
        mulhdu  r26, r9, r6
        mulld   r7, r27, r6
        mulhdu  r8, r27, r6
        ld      r9, 0(up)
        ld      r27, 8(up)
        adde    r0, r0, r12
        adde    r7, r7, r26
        mulld   r26, r9, r6
        mulhdu  r10, r9, r6
        mulld   r11, r27, r6
        mulhdu  r12, r27, r6
        ld      r9, 16(up)
        ld      r27, 24(up)
        std     r0, 8(rp)
        adde    r26, r26, r8
        std     r7, 16(rp)
        adde    r11, r11, r10
        std     r26, 24(rp)
        addi    up, up, 32
        std     r11, 32(rp)
        addi    rp, rp, 32
        bdnz    L(tm1)

L(em1): mulld   r0, r9, r6
        mulhdu  r26, r9, r6
        mulld   r7, r27, r6
        mulhdu  r8, r27, r6
        adde    r0, r0, r12
        adde    r7, r7, r26
        std     r0, 8(rp)
        std     r7, 16(rp)
        addze   r8, r8
        std     r8, 24(rp)
        addi    n, n, 2
        b       L(outer_loop_ent_3)

L(b2):  addi    r7, r7, -1      C FIXME
        mtctr   r7              C FIXME
        ld      r6, 0(up)
        ld      r9, 8(up)
        ld      r27, 16(up)
        mulld   r0, r9, r6
        mulhdu  r26, r9, r6
        mulld   r7, r27, r6
        mulhdu  r8, r27, r6
        ld      r9, 24(up)
        mulld   r11, r9, r6
        mulhdu  r10, r9, r6
        addc    r7, r7, r26
        adde    r11, r11, r8
        addze   r12, r10
        std     r0, 8(rp)
        std     r7, 16(rp)
        std     r11, 24(rp)
        addi    rp, rp, 24
        ld      r9, 32(up)
        ld      r27, 40(up)
        addi    up, up, 48
        bdz     L(em2)

        ALIGN(16)
L(tm2): mulld   r0, r9, r6
        mulhdu  r26, r9, r6
        mulld   r7, r27, r6
        mulhdu  r8, r27, r6
        ld      r9, 0(up)
        ld      r27, 8(up)
        adde    r0, r0, r12
        adde    r7, r7, r26
        mulld   r26, r9, r6
        mulhdu  r10, r9, r6
        mulld   r11, r27, r6
        mulhdu  r12, r27, r6
        ld      r9, 16(up)
        ld      r27, 24(up)
        std     r0, 8(rp)
        adde    r26, r26, r8
        std     r7, 16(rp)
        adde    r11, r11, r10
        std     r26, 24(rp)
        addi    up, up, 32
        std     r11, 32(rp)
        addi    rp, rp, 32
        bdnz    L(tm2)

L(em2): mulld   r0, r9, r6
        mulhdu  r26, r9, r6
        mulld   r7, r27, r6
        mulhdu  r8, r27, r6
        adde    r0, r0, r12
        adde    r7, r7, r26
        std     r0, 8(rp)
        std     r7, 16(rp)
        addze   r8, r8
        std     r8, 24(rp)
        addi    n, n, 2
        b       L(outer_loop_ent_0)
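
C Outer loop: each pass is an addmul_1, adding up[k] * up[k+1..n-1] into
C the result and then advancing up by one limb and rp by two.  The entry
C points L(outer_loop) and L(outer_loop_ent_0/3/2) serve the four residues
C of the remaining length mod 4; each pass continues at the entry point
C matching the next residue.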
L(outer_loop):
        addi    n, n, -1
        addi    up_outer, up_outer, 8
        addi    rp_outer, rp_outer, 16

        mr      up, up_outer
        addi    rp, rp_outer, 8

        srdi    r0, n, 2
        mtctr   r0

        bdz     L(outer_end)

        ld      r6, 0(up)
        ld      r9, 8(up)
        ld      r27, 16(up)
        mulld   r0, r9, r6
        mulhdu  r26, r9, r6
        mulld   r7, r27, r6
        mulhdu  r8, r27, r6
        ld      r9, 24(up)
        ld      r28, 0(rp)
        ld      r29, 8(rp)
        ld      r30, 16(rp)
        mulld   r11, r9, r6
        mulhdu  r10, r9, r6
        addc    r7, r7, r26
        adde    r11, r11, r8
        addze   r12, r10
        addc    r0, r0, r28
        std     r0, 0(rp)
        adde    r7, r7, r29
        std     r7, 8(rp)
        adde    r11, r11, r30
        std     r11, 16(rp)
        addi    rp, rp, 24
        ld      r9, 32(up)
        ld      r27, 40(up)
        addi    up, up, 48
        bdz     L(ea1)
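
C Inner addmul_1 loop: each iteration multiplies four limbs of up by r6,
C adds the products to four result limbs loaded from rp, and stores them
C back, carrying in r12.  L(ta0), L(ta3) and L(ta2) below are copies.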
387 ALIGN(16)
388 L(ta1): mulld r0, r9, r6
389 mulhdu r26, r9, r6 C 9
390 mulld r7, r27, r6
391 mulhdu r8, r27, r6 C 27
392 ld r9, 0(up)
393 ld r28, 0(rp)
394 ld r27, 8(up)
395 ld r29, 8(rp)
396 adde r0, r0, r12 C 0 12
397 adde r7, r7, r26 C 5 7
398 mulld r26, r9, r6
399 mulhdu r10, r9, r6 C 9
400 mulld r11, r27, r6
401 mulhdu r12, r27, r6 C 27
402 ld r9, 16(up)
403 ld r30, 16(rp)
404 ld r27, 24(up)
405 ld r31, 24(rp)
406 adde r26, r26, r8 C 8 5
407 adde r11, r11, r10 C 10 11
408 addze r12, r12 C 12
409 addc r0, r0, r28 C 0 28
410 std r0, 0(rp) C 0
411 adde r7, r7, r29 C 7 29
412 std r7, 8(rp) C 7
413 adde r26, r26, r30 C 5 30
414 std r26, 16(rp) C 5
415 adde r11, r11, r31 C 11 31
416 std r11, 24(rp) C 11
417 addi up, up, 32
418 addi rp, rp, 32
419 bdnz L(ta1)
421 L(ea1): mulld r0, r9, r6
422 mulhdu r26, r9, r6
423 mulld r7, r27, r6
424 mulhdu r8, r27, r6
425 ld r28, 0(rp)
426 ld r29, 8(rp)
427 adde r0, r0, r12
428 adde r7, r7, r26
429 addze r8, r8
430 addc r0, r0, r28
431 std r0, 0(rp)
432 adde r7, r7, r29
433 std r7, 8(rp)
434 addze r8, r8
435 std r8, 16(rp)

L(outer_loop_ent_0):
        addi    n, n, -1
        addi    up_outer, up_outer, 8
        addi    rp_outer, rp_outer, 16

        mr      up, up_outer
        addi    rp, rp_outer, 8

        srdi    r0, n, 2
        mtctr   r0

        ld      r6, 0(up)
        ld      r9, 8(up)
        ld      r27, 16(up)
        ld      r28, 0(rp)
        ld      r29, 8(rp)
        mulld   r0, r9, r6
        mulhdu  r26, r9, r6
        mulld   r7, r27, r6
        mulhdu  r8, r27, r6
        addc    r0, r0, r28
        adde    r7, r7, r26
        addze   r12, r8
        std     r0, 0(rp)
        adde    r7, r7, r29
        std     r7, 8(rp)
        addi    rp, rp, 16
        ld      r9, 24(up)
        ld      r27, 32(up)
        addi    up, up, 40
        bdz     L(ea0)

        ALIGN(16)
L(ta0): mulld   r0, r9, r6
        mulhdu  r26, r9, r6     C 9
        mulld   r7, r27, r6
        mulhdu  r8, r27, r6     C 27
        ld      r9, 0(up)
        ld      r28, 0(rp)
        ld      r27, 8(up)
        ld      r29, 8(rp)
        adde    r0, r0, r12     C 0 12
        adde    r7, r7, r26     C 5 7
        mulld   r26, r9, r6
        mulhdu  r10, r9, r6     C 9
        mulld   r11, r27, r6
        mulhdu  r12, r27, r6    C 27
        ld      r9, 16(up)
        ld      r30, 16(rp)
        ld      r27, 24(up)
        ld      r31, 24(rp)
        adde    r26, r26, r8    C 8 5
        adde    r11, r11, r10   C 10 11
        addze   r12, r12        C 12
        addc    r0, r0, r28     C 0 28
        std     r0, 0(rp)       C 0
        adde    r7, r7, r29     C 7 29
        std     r7, 8(rp)       C 7
        adde    r26, r26, r30   C 5 30
        std     r26, 16(rp)     C 5
        adde    r11, r11, r31   C 11 31
        std     r11, 24(rp)     C 11
        addi    up, up, 32
        addi    rp, rp, 32
        bdnz    L(ta0)

L(ea0): mulld   r0, r9, r6
        mulhdu  r26, r9, r6
        mulld   r7, r27, r6
        mulhdu  r8, r27, r6
        ld      r28, 0(rp)
        ld      r29, 8(rp)
        adde    r0, r0, r12
        adde    r7, r7, r26
        addze   r8, r8
        addc    r0, r0, r28
        std     r0, 0(rp)
        adde    r7, r7, r29
        std     r7, 8(rp)
        addze   r8, r8
        std     r8, 16(rp)

L(outer_loop_ent_3):
        addi    n, n, -1
        addi    up_outer, up_outer, 8
        addi    rp_outer, rp_outer, 16

        mr      up, up_outer
        addi    rp, rp_outer, 8

        srdi    r0, n, 2
        mtctr   r0

        ld      r6, 0(up)
        ld      r9, 8(up)
        ld      r28, 0(rp)
        mulld   r0, r9, r6
        mulhdu  r12, r9, r6
        addc    r0, r0, r28
        std     r0, 0(rp)
        addi    rp, rp, 8
        ld      r9, 16(up)
        ld      r27, 24(up)
        addi    up, up, 32
        bdz     L(ea3)

        ALIGN(16)
L(ta3): mulld   r0, r9, r6
        mulhdu  r26, r9, r6     C 9
        mulld   r7, r27, r6
        mulhdu  r8, r27, r6     C 27
        ld      r9, 0(up)
        ld      r28, 0(rp)
        ld      r27, 8(up)
        ld      r29, 8(rp)
        adde    r0, r0, r12     C 0 12
        adde    r7, r7, r26     C 5 7
        mulld   r26, r9, r6
        mulhdu  r10, r9, r6     C 9
        mulld   r11, r27, r6
        mulhdu  r12, r27, r6    C 27
        ld      r9, 16(up)
        ld      r30, 16(rp)
        ld      r27, 24(up)
        ld      r31, 24(rp)
        adde    r26, r26, r8    C 8 5
        adde    r11, r11, r10   C 10 11
        addze   r12, r12        C 12
        addc    r0, r0, r28     C 0 28
        std     r0, 0(rp)       C 0
        adde    r7, r7, r29     C 7 29
        std     r7, 8(rp)       C 7
        adde    r26, r26, r30   C 5 30
        std     r26, 16(rp)     C 5
        adde    r11, r11, r31   C 11 31
        std     r11, 24(rp)     C 11
        addi    up, up, 32
        addi    rp, rp, 32
        bdnz    L(ta3)

L(ea3): mulld   r0, r9, r6
        mulhdu  r26, r9, r6
        mulld   r7, r27, r6
        mulhdu  r8, r27, r6
        ld      r28, 0(rp)
        ld      r29, 8(rp)
        adde    r0, r0, r12
        adde    r7, r7, r26
        addze   r8, r8
        addc    r0, r0, r28
        std     r0, 0(rp)
        adde    r7, r7, r29
        std     r7, 8(rp)
        addze   r8, r8
        std     r8, 16(rp)

L(outer_loop_ent_2):
        addi    n, n, -1
        addi    up_outer, up_outer, 8
        addi    rp_outer, rp_outer, 16

        mr      up, up_outer
        addi    rp, rp_outer, 8

        srdi    r0, n, 2
        mtctr   r0

        addic   r0, r0, 0
        li      r12, 0          C cy_limb = 0
        ld      r6, 0(up)
        ld      r9, 8(up)
        ld      r27, 16(up)
        bdz     L(ea2)
        addi    up, up, 24

        ALIGN(16)
L(ta2): mulld   r0, r9, r6
        mulhdu  r26, r9, r6     C 9
        mulld   r7, r27, r6
        mulhdu  r8, r27, r6     C 27
        ld      r9, 0(up)
        ld      r28, 0(rp)
        ld      r27, 8(up)
        ld      r29, 8(rp)
        adde    r0, r0, r12     C 0 12
        adde    r7, r7, r26     C 5 7
        mulld   r26, r9, r6
        mulhdu  r10, r9, r6     C 9
        mulld   r11, r27, r6
        mulhdu  r12, r27, r6    C 27
        ld      r9, 16(up)
        ld      r30, 16(rp)
        ld      r27, 24(up)
        ld      r31, 24(rp)
        adde    r26, r26, r8    C 8 5
        adde    r11, r11, r10   C 10 11
        addze   r12, r12        C 12
        addc    r0, r0, r28     C 0 28
        std     r0, 0(rp)       C 0
        adde    r7, r7, r29     C 7 29
        std     r7, 8(rp)       C 7
        adde    r26, r26, r30   C 5 30
        std     r26, 16(rp)     C 5
        adde    r11, r11, r31   C 11 31
        std     r11, 24(rp)     C 11
        addi    up, up, 32
        addi    rp, rp, 32
        bdnz    L(ta2)

L(ea2): mulld   r0, r9, r6
        mulhdu  r26, r9, r6
        mulld   r7, r27, r6
        mulhdu  r8, r27, r6
        ld      r28, 0(rp)
        ld      r29, 8(rp)
        adde    r0, r0, r12
        adde    r7, r7, r26
        addze   r8, r8
        addc    r0, r0, r28
        std     r0, 0(rp)
        adde    r7, r7, r29
        std     r7, 8(rp)
        addze   r8, r8
        std     r8, 16(rp)

        b       L(outer_loop)

L(outer_end):
        ld      r6, 0(up)
        ld      r9, 8(up)
        ld      r11, 0(rp)
        mulld   r0, r9, r6
        mulhdu  r8, r9, r6
        addc    r0, r0, r11
        std     r0, 0(rp)
        addze   r8, r8
        std     r8, 8(rp)
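
C All off-diagonal products are now accumulated in rp[1..2n-2].  The
C sqr_diag_addlsh1 code below computes rp = 2*rp + squares of the up limbs,
C working from the saved original arguments.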
define(`rp',    `rp_saved')
define(`up',    `r5')
define(`n',     `r6')
define(`climb', `r0')

        addi    r4, rp_saved, 8
        mr      r5, up_saved
        mr      r6, n_saved
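
C Dispatch on n mod 4 again, now for the sqr_diag_addlsh1 feed-in code.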
        rldicl. r0, n, 0,62     C r0 = n & 3, set cr0
        cmpdi   cr6, r0, 2
        addi    n, n, 2         C compute count...
        srdi    n, n, 2         C ...for ctr
        mtctr   n               C put loop count into ctr
        beq     cr0, L(xb0)
        blt     cr6, L(xb1)
        beq     cr6, L(xb2)

L(xb3): ld      r6, 0(up)
        ld      r7, 8(up)
        ld      r12, 16(up)
        addi    up, up, 24
        mulld   r24, r6, r6
        mulhdu  r25, r6, r6
        mulld   r26, r7, r7
        mulhdu  r27, r7, r7
        mulld   r28, r12, r12
        mulhdu  r29, r12, r12
        ld      r10, 8(rp)
        ld      r11, 16(rp)
        ld      r6, 24(rp)
        ld      r7, 32(rp)
        addc    r10, r10, r10
        adde    r11, r11, r11
        adde    r6, r6, r6
        adde    r7, r7, r7
        addze   climb, r29
        addc    r10, r10, r25
        adde    r11, r11, r26
        adde    r6, r6, r27
        adde    r7, r7, r28
        std     r24, 0(rp)
        std     r10, 8(rp)
        std     r11, 16(rp)
        std     r6, 24(rp)
        std     r7, 32(rp)
        addi    rp, rp, 40
        bdnz    L(top)
        b       L(end)

L(xb2): ld      r6, 0(up)
        ld      r7, 8(up)
        addi    up, up, 16
        mulld   r24, r6, r6
        mulhdu  r25, r6, r6
        mulld   r26, r7, r7
        mulhdu  r27, r7, r7
        ld      r10, 8(rp)
        ld      r11, 16(rp)
        addc    r10, r10, r10
        adde    r11, r11, r11
        addze   climb, r27
        addc    r10, r10, r25
        adde    r11, r11, r26
        std     r24, 0(rp)
        std     r10, 8(rp)
        std     r11, 16(rp)
        addi    rp, rp, 24
        bdnz    L(top)
        b       L(end)

L(xb0): ld      r6, 0(up)
        ld      r7, 8(up)
        ld      r12, 16(up)
        ld      r23, 24(up)
        addi    up, up, 32
        mulld   r24, r6, r6
        mulhdu  r25, r6, r6
        mulld   r26, r7, r7
        mulhdu  r27, r7, r7
        mulld   r28, r12, r12
        mulhdu  r29, r12, r12
        mulld   r30, r23, r23
        mulhdu  r31, r23, r23
        ld      r10, 8(rp)
        ld      r11, 16(rp)
        ld      r6, 24(rp)
        ld      r7, 32(rp)
        ld      r12, 40(rp)
        ld      r23, 48(rp)
        addc    r10, r10, r10
        adde    r11, r11, r11
        adde    r6, r6, r6
        adde    r7, r7, r7
        adde    r12, r12, r12
        adde    r23, r23, r23
        addze   climb, r31
        std     r24, 0(rp)
        addc    r10, r10, r25
        std     r10, 8(rp)
        adde    r11, r11, r26
        std     r11, 16(rp)
        adde    r6, r6, r27
        std     r6, 24(rp)
        adde    r7, r7, r28
        std     r7, 32(rp)
        adde    r12, r12, r29
        std     r12, 40(rp)
        adde    r23, r23, r30
        std     r23, 48(rp)
        addi    rp, rp, 56
        bdnz    L(top)
        b       L(end)

L(xb1): ld      r6, 0(up)
        addi    up, up, 8
        mulld   r24, r6, r6
        mulhdu  climb, r6, r6
        std     r24, 0(rp)
        addic   rp, rp, 8       C clear carry as side-effect
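
C Main sqr_diag_addlsh1 loop: each iteration squares four limbs of up,
C doubles eight result limbs through the adde chain, adds the four squares,
C and passes the high carry to the next iteration in climb.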
        ALIGN(32)
L(top): ld      r6, 0(up)
        ld      r7, 8(up)
        ld      r12, 16(up)
        ld      r23, 24(up)
        addi    up, up, 32
        mulld   r24, r6, r6
        mulhdu  r25, r6, r6
        mulld   r26, r7, r7
        mulhdu  r27, r7, r7
        mulld   r28, r12, r12
        mulhdu  r29, r12, r12
        mulld   r30, r23, r23
        mulhdu  r31, r23, r23
        ld      r8, 0(rp)
        ld      r9, 8(rp)
        adde    r8, r8, r8
        adde    r9, r9, r9
        ld      r10, 16(rp)
        ld      r11, 24(rp)
        adde    r10, r10, r10
        adde    r11, r11, r11
        ld      r6, 32(rp)
        ld      r7, 40(rp)
        adde    r6, r6, r6
        adde    r7, r7, r7
        ld      r12, 48(rp)
        ld      r23, 56(rp)
        adde    r12, r12, r12
        adde    r23, r23, r23
        addze   r31, r31
        addc    r8, r8, climb
        std     r8, 0(rp)
        adde    r9, r9, r24
        std     r9, 8(rp)
        adde    r10, r10, r25
        std     r10, 16(rp)
        adde    r11, r11, r26
        std     r11, 24(rp)
        adde    r6, r6, r27
        std     r6, 32(rp)
        adde    r7, r7, r28
        std     r7, 40(rp)
        adde    r12, r12, r29
        std     r12, 48(rp)
        adde    r23, r23, r30
        std     r23, 56(rp)
        mr      climb, r31
        addi    rp, rp, 64
        bdnz    L(top)

L(end): addze   climb, climb
        std     climb, 0(rp)

        ld      r31, -8(r1)
        ld      r30, -16(r1)
        ld      r29, -24(r1)
        ld      r28, -32(r1)
        ld      r27, -40(r1)
        ld      r26, -48(r1)
        ld      r25, -56(r1)
        ld      r24, -64(r1)
        ld      r23, -72(r1)
        ld      r22, -80(r1)
        ld      r21, -88(r1)
        blr
EPILOGUE()