dnl  PowerPC-64 mpn_mul_basecase.
dnl  Copyright 1999-2001, 2003-2006, 2008, 2010 Free Software Foundation,
dnl  Inc.
dnl  This file is part of the GNU MP Library.
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl  or both in parallel, as here.
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.
dnl  Pull in GMP's build configuration m4 macros (ABI selection,
dnl  PROLOGUE/EPILOGUE, ASM_START, register-name handling, etc.).
include(`../config.m4')
C  * Reduce register usage.  At least 4 fewer registers could be used.
C  * Unroll more.  8-way unrolling would bring us to 10 c/l, 16-way unrolling
C    would bring us to 9 c/l.
C  * The bdz insns for b1 and b2 will never branch.
C  * Align things better, perhaps by moving things like pointer updates from
C    before to after loops.
dnl  m4 register aliases: r22/r23 track the result and source pointers for
dnl  the outer loop (advanced by 8 bytes per outer iteration; the inner-loop
dnl  rp/up pointers are re-derived from them at each outer step).
define(`outer_rp', `r22')
define(`outer_up', `r23')
59 PROLOGUE(mpn_mul_basecase)
61 C Special code for un <= 2, for efficiency of these important cases,
62 C and since it simplifies the default code.
68 mulld r8, r5, r7 C weight 0
69 mulhdu r9, r5, r7 C weight 1
76 mulld r8, r0, r7 C weight 1
77 mulhdu r10, r0, r7 C weight 2
87 mulld r8, r5, r6 C weight 1
88 mulhdu r11, r5, r6 C weight 2
89 mulld r12, r0, r6 C weight 2
90 mulhdu r0, r0, r6 C weight 3
118 ld v0, 0(vp) C new v limb
122 rldicl. r0, un, 0,62 C r0 = n & 3, set cr0
124 addi un, un, 4 C compute count...
125 srdi un, un, 2 C ...for ctr
126 mtctr un C copy inner loop count into ctr
188 mtctr un C copy inner loop count into ctr
189 addi rp, outer_rp, 24
190 addi up, outer_up, 16
191 addi outer_rp, outer_rp, 8
192 ld v0, 0(vp) C new v limb
217 ALIGN(32) C registers dying
226 mulhdu r10, r26, v0 C 26
228 mulhdu r8, r27, v0 C 27
230 mulhdu r27, r20, v0 C 26
232 mulhdu r26, r21, v0 C 27
234 adde r0, r0, r12 C 0 12
236 adde r24, r24, r10 C 24 10
239 adde r9, r9, r8 C 8 9
240 adde r11, r11, r27 C 27 11
242 addc r0, r0, r28 C 0 28
244 adde r24, r24, r29 C 7 29
246 adde r9, r9, r30 C 9 30
248 adde r11, r11, r31 C 11 31
305 mtctr un C copy inner loop count into ctr
308 addi outer_rp, outer_rp, 8
309 ld v0, 0(vp) C new v limb
319 ALIGN(32) C registers dying
328 mulhdu r10, r26, v0 C 26
330 mulhdu r8, r27, v0 C 27
332 mulhdu r27, r20, v0 C 26
334 mulhdu r26, r21, v0 C 27
336 adde r0, r0, r12 C 0 12
338 adde r24, r24, r10 C 24 10
341 adde r9, r9, r8 C 8 9
342 adde r11, r11, r27 C 27 11
344 addc r0, r0, r28 C 0 28
346 adde r24, r24, r29 C 7 29
348 adde r9, r9, r30 C 9 30
350 adde r11, r11, r31 C 11 31
407 mtctr un C copy inner loop count into ctr
409 addi up, outer_up, -8
410 addi outer_rp, outer_rp, 8
411 ld v0, 0(vp) C new v limb
417 ALIGN(32) C registers dying
426 mulhdu r10, r26, v0 C 26
428 mulhdu r8, r27, v0 C 27
430 mulhdu r27, r20, v0 C 26
432 mulhdu r26, r21, v0 C 27
434 adde r0, r0, r12 C 0 12
436 adde r24, r24, r10 C 24 10
439 adde r9, r9, r8 C 8 9
440 adde r11, r11, r27 C 27 11
442 addc r0, r0, r28 C 0 28
444 adde r24, r24, r29 C 7 29
446 adde r9, r9, r30 C 9 30
448 adde r11, r11, r31 C 11 31
511 mtctr un C copy inner loop count into ctr
512 addi rp, outer_rp, 16
514 addi outer_rp, outer_rp, 8
515 ld v0, 0(vp) C new v limb
533 ALIGN(16) C registers dying
542 mulhdu r10, r26, v0 C 26
544 mulhdu r8, r27, v0 C 27
546 mulhdu r27, r20, v0 C 26
548 mulhdu r26, r21, v0 C 27
550 adde r0, r0, r12 C 0 12
552 adde r24, r24, r10 C 24 10
555 adde r9, r9, r8 C 8 9
556 adde r11, r11, r27 C 27 11
558 addc r0, r0, r28 C 0 28
560 adde r24, r24, r29 C 7 29
562 adde r9, r9, r30 C 9 30
564 adde r11, r11, r31 C 11 31
576 L(ret): ld r31, -8(r1)