beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / powerpc64 / mode64 / mod_1_1.asm
blob873373054f5194bd1175df9fc189da9ed4e7ac97
1 dnl PowerPC-64 mpn_mod_1_1p
3 dnl Copyright 2010, 2011 Free Software Foundation, Inc.
5 dnl This file is part of the GNU MP Library.
6 dnl
7 dnl The GNU MP Library is free software; you can redistribute it and/or modify
8 dnl it under the terms of either:
9 dnl
10 dnl * the GNU Lesser General Public License as published by the Free
11 dnl Software Foundation; either version 3 of the License, or (at your
12 dnl option) any later version.
13 dnl
14 dnl or
15 dnl
16 dnl * the GNU General Public License as published by the Free Software
17 dnl Foundation; either version 2 of the License, or (at your option) any
18 dnl later version.
19 dnl
20 dnl or both in parallel, as here.
21 dnl
22 dnl The GNU MP Library is distributed in the hope that it will be useful, but
23 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
25 dnl for more details.
26 dnl
27 dnl You should have received copies of the GNU General Public License and the
28 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
29 dnl see https://www.gnu.org/licenses/.
31 include(`../config.m4')
33 C cycles/limb
34 C POWER3/PPC630 ?
35 C POWER4/PPC970 17
36 C POWER5 16
37 C POWER6 30
38 C POWER7 10.2
40 C TODO
41 C * Optimise, in particular the cps function. This was compiler-generated and
42 C then hand optimised.
C INPUT PARAMETERS
C Symbolic names for the four argument registers of mpn_mod_1_1p.  The
C instruction sequences in this file are written with the raw register
C numbers, so these names mainly document which argument lives where.
define(`ap',  `r3')
define(`n',   `r4')
define(`d',   `r5')
define(`cps', `r6')

ASM_START()

C External routine called from mpn_mod_1_1p_cps.
EXTERN_FUNC(mpn_invert_limb)
C mpn_mod_1_1p (ap, n, d, cps)
C
C Reduces the n limbs at ap modulo d using the table at cps, and returns the
C remainder in r3.
C
C On entry:
C   r3 = ap    base address of the limb array; the highest index is read first
C   r4 = n     limb count.  The limbs ap[n-1] and ap[n-2] are loaded before
C              any count check, so callers presumably must pass n >= 2.
C   r5 = d     divisor.  NOTE(review): presumably already shifted left by
C              cnt, since the result is shifted right by cnt on exit --
C              confirm against the callers.
C   r6 = cps   table written by mpn_mod_1_1p_cps:
C                 0(r6)  result of mpn_invert_limb (bi)
C                 8(r6)  shift count cnt, stored as a doubleword
C                16(r6)  B1modb
C                24(r6)  B2modb
C
C Leaf routine: no stack frame; modifies r0, r3, r4, r7-r12, ctr, cr7 and
C the carry bit.
PROLOGUE(mpn_mod_1_1p)
	sldi	r10, r4, 3		C r10 = 8*n, byte size of the operand
	addi	r4, r4, -1		C r4 = n-1, becomes the loop counter
	add	r3, r3, r10		C r3 = ap + 8*n, one past the top limb
	ld	r0, 16(r6)		C B1modb
	ld	r12, 24(r6)		C B2modb
	ld	r9, -8(r3)		C ap[n-1]
	ld	r10, -16(r3)		C ap[n-2]
	mtctr	r4
	mulhdu	r8, r9, r0
	mulld	r7, r9, r0
	addc	r11, r7, r10		C (r9:r11) = ap[n-1] * B1modb + ap[n-2]
	addze	r9, r8
	bdz	L(end)			C n = 2: no further limbs to fold in

C Each pass folds in the next lower limb:
C   (r9:r11) <- limb + r11 * B1modb + r9 * B2modb
C kept as a two-limb value (r9 high, r11 low).  The loop runs n-2 times.
	ALIGN(16)
L(top):	ld	r4, -24(r3)		C next lower limb
	addi	r3, r3, -8
	nop
	mulld	r10, r11, r0		C low  (r11 * B1modb)
	mulld	r8, r9, r12		C low  (r9  * B2modb)
	mulhdu	r11, r11, r0		C high (r11 * B1modb)
	mulhdu	r9, r9, r12		C high (r9  * B2modb)
	addc	r7, r10, r4
	addze	r10, r11
	addc	r11, r8, r7
	adde	r9, r9, r10
	bdnz	L(top)

C The shift count is stored as a 64-bit doubleword at 8(r6); only its low
C 32 bits are loaded, so the byte offset depends on the limb endianness.
L(end):
ifdef(`HAVE_LIMB_LITTLE_ENDIAN',
`	lwz	r0, 8(r6)',
`	lwz	r0, 12(r6)')
	ld	r3, 0(r6)		C bi
	cmpdi	cr7, r0, 0
	beq-	cr7, L(4)		C cnt = 0: nothing to shift into r9
	subfic	r10, r0, 64
	sld	r9, r9, r0
	srd	r10, r11, r10
	or	r9, r10, r9		C r9 = high limb of (r9:r11) << cnt
C Branch-free conditional subtract: r9 -= d when r9 >= d (unsigned).
L(4):	subfc	r10, r5, r9		C carry set iff r9 >= d
	subfe	r10, r10, r10		C r10 = 0 if r9 >= d, else all ones
	nand	r10, r10, r10		C invert: all ones iff r9 >= d
	sld	r11, r11, r0		C r11 = low limb of (r9:r11) << cnt
	and	r10, r10, r5
	subf	r9, r10, r9
C Divide (r9:r11) by d using the inverse in r3; only the remainder is kept.
	mulhdu	r10, r9, r3
	mulld	r3, r9, r3
	addi	r9, r9, 1
	addc	r8, r3, r11		C r8 = low limb of the quotient estimate
	adde	r3, r10, r9		C r3 = high limb of the quotient estimate
	mulld	r3, r3, r5
	subf	r3, r3, r11		C r3 = r11 - q*d, candidate remainder
	cmpld	cr7, r8, r3
	bge	cr7, L(5)		C FIXME: Make branch-less
	add	r3, r3, r5		C estimate was one too large: add d back
L(5):	cmpld	cr7, r3, r5
	bge-	cr7, L(10)
	srd	r3, r3, r0		C undo the shift by cnt
	blr				C return here; must not fall into L(10)

L(10):	subf	r3, r5, r3		C remainder was >= d: subtract d once
	srd	r3, r3, r0		C undo the shift by cnt
	blr
EPILOGUE()
C mpn_mod_1_1p_cps (cps, b)
C
C Fills the four-doubleword table at r3 that mpn_mod_1_1p reads, for the
C divisor passed in r4.
C
C On entry:
C   r3 = address of the table to fill
C   r4 = divisor b.  NOTE(review): presumably non-zero -- confirm with the
C        callers.
C
C Table written:
C    0(table)  bi, the value returned by mpn_invert_limb for b << cnt
C    8(table)  cnt, the number of leading zero bits of b
C   16(table)  the value mpn_mod_1_1p loads as B1modb
C   24(table)  the value mpn_mod_1_1p loads as B2modb
C
C Non-leaf: r29-r31 and the link register are saved and restored around the
C call, and a 144-byte stack frame is created for it.
PROLOGUE(mpn_mod_1_1p_cps,toc)
	mflr	r0
	std	r29, -24(r1)		C save r29-r31 below the stack pointer
	std	r30, -16(r1)
	std	r31, -8(r1)
	cntlzd	r31, r4			C r31 = cnt
	std	r0, 16(r1)		C save the return address
	extsw	r31, r31
	mr	r29, r3			C r29 = table address, kept across the call
	stdu	r1, -144(r1)		C allocate the stack frame
	sld	r30, r4, r31		C r30 = b << cnt
	mr	r3, r30
	CALL(	mpn_invert_limb)	C r3 = bi
	cmpdi	cr7, r31, 0
	neg	r0, r30			C B1modb when cnt = 0
	beq-	cr7, L(13)
	subfic	r11, r31, 64
	li	r0, 1
	neg	r9, r30
	srd	r11, r3, r11		C bi >> (64 - cnt)
	sld	r0, r0, r31		C 1 << cnt
	or	r0, r11, r0
	mulld	r0, r0, r9		C r0 = -(b << cnt) * ((1 << cnt) | (bi >> (64 - cnt)))
C Derive the second constant from r0, bi and the shifted divisor in r30.
L(13):	mulhdu	r9, r0, r3
	mulld	r11, r0, r3
	add	r9, r0, r9
	nor	r9, r9, r9		C r9 = ~(r0 + high (r0 * bi))
	mulld	r9, r9, r30
	cmpld	cr7, r11, r9
	bge	cr7, L(14)
	add	r9, r9, r30		C adjust when low (r0 * bi) < r9 (unsigned)
L(14):	addi	r1, r1, 144		C release the stack frame
	srd	r0, r0, r31
	std	r31, 8(r29)		C cnt
	std	r3, 0(r29)		C bi
	std	r0, 16(r29)		C B1modb
	ld	r0, 16(r1)		C reload the return address
	srd	r9, r9, r31
	ld	r30, -16(r1)
	ld	r31, -8(r1)
	std	r9, 24(r29)		C B2modb
	ld	r29, -24(r1)
	mtlr	r0
	blr				C return to the caller
EPILOGUE()