dnl  Provenance (repository viewer banner): luatex.git, beta-0.89.2
dnl    source/libs/gmp/gmp-src/mpn/powerpc64/mode64/mod_1_4.asm
dnl    blob 0b7d6bf6997f85fa93417ab1bbf926648fce9499
dnl  PowerPC-64 mpn_mod_1s_4p

dnl  Copyright 2010, 2011 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.
include(`../config.m4')

C                   cycles/limb
C POWER3/PPC630          ?
C POWER4/PPC970          9
C POWER5                 9
C POWER6                13
C POWER7                 3.5

C TODO
C  * Optimise, in particular the cps function.  This was compiler-generated and
C    then hand optimised.

C INPUT PARAMETERS
C Register aliases for the four arguments of mpn_mod_1s_4p:
C   ap   r3  pointer to the lowest limb of the operand; the routine advances
C            it past the highest limb and then walks downwards
C   n    r4  operand length in 8-byte limbs (it is scaled by 8 before being
C            added to ap)
C   d    r5  divisor used by the final reduction
C            NOTE(review): presumably already shifted left by the count kept
C            at byte offset 8 of the table -- confirm against the callers
C   cps  r6  pointer to the table of precomputed values, read at byte
C            offsets 0 through 48
define(`ap', `r3')
define(`n', `r4')
define(`d', `r5')
define(`cps', `r6')

ASM_START()

EXTERN_FUNC(mpn_invert_limb)
C mpn_mod_1s_4p
C
C In:   r3 = ap, r4 = n, r5 = d, r6 = cps
C Out:  r3 = reduced value, shifted right by the count read from the table
C
C The operand is consumed from its highest limb downwards.  The leading
C n mod 4 limbs (four of them when n mod 4 is zero) are folded into a two-limb
C value r9:r0 (high:low); every loop iteration then folds four more limbs plus
C the previous r9:r0 into a new r9:r0.  The tail folds r9:r0 once more, shifts
C it left by the count and reduces it modulo d.
C
C Table entries read here, as byte offsets from cps:
C    0  multiplier for the quotient estimate in the final reduction
C    8  shift count; only its low 32 bits are loaded
C   16  r26, multiplies the limb one position above the lowest of a group
C   24  r25, multiplies the limb two positions above the lowest
C   32  r24, multiplies the limb three positions above the lowest
C   40  r30, multiplies the low limb of the running value (B4modb)
C   48  r23, multiplies the high limb of the running value (B5modb)
C
C r23-r31 are spilled at negative offsets from r1 without moving r1.  No call
C is made from this routine, so nothing overwrites that area before the
C registers are reloaded.
C
C n must be at least 1: with n = 0 the code takes the L(b00) path, reads four
C limbs below ap, and the count register starts at zero so the bdz at L(6)
C does not exit.
PROLOGUE(mpn_mod_1s_4p)
	std	r23, -72(r1)
	ld	r23, 48(cps)
	std	r24, -64(r1)
	std	r25, -56(r1)
	ld	r24, 32(cps)
	ld	r25, 24(cps)
	std	r26, -48(r1)
	std	r27, -40(r1)
	ld	r26, 16(cps)
	std	r28, -32(r1)
	std	r29, -24(r1)
	std	r30, -16(r1)
	std	r31, -8(r1)
	ld	r30, 40(cps)

	rldicl.	r0, n, 0,62		C r0 = n mod 4, cr0 reflects the result
	sldi	r31, n, 3		C byte length of the operand
	add	ap, ap, r31		C make ap point at end of operand

	cmpdi	cr7, r0, 2
	beq	cr0, L(b00)		C n mod 4 = 0
	blt	cr7, L(b01)		C n mod 4 = 1
	beq	cr7, L(b10)		C n mod 4 = 2

C n mod 4 = 3: r9:r0 = limb[-24] + limb[-16]*r26 + limb[-8]*r25
L(b11):	ld	r11, -16(ap)
	ld	r9, -8(ap)
	ld	r0, -24(ap)
	mulhdu	r27, r11, r26
	mulld	r8, r11, r26
	mulhdu	r11, r9, r25
	mulld	r9, r9, r25
	addc	r31, r8, r0
	addze	r10, r27
	addc	r0, r9, r31
	adde	r9, r11, r10
	addi	ap, ap, -40		C loop offsets -16..8 now name the next four limbs
	b	L(6)

C n mod 4 = 0: r9:r0 = limb[-32] + limb[-24]*r26 + limb[-16]*r25 + limb[-8]*r24
	ALIGN(16)
L(b00):	ld	r11, -24(ap)
	ld	r10, -16(ap)
	ld	r9, -8(ap)
	ld	r0, -32(ap)
	mulld	r8, r11, r26
	mulhdu	r7, r10, r25
	mulhdu	r27, r11, r26
	mulhdu	r11, r9, r24
	mulld	r10, r10, r25
	mulld	r9, r9, r24
	addc	r31, r8, r0
	addze	r0, r27
	addc	r8, r31, r10
	adde	r10, r0, r7
	addc	r0, r9, r8
	adde	r9, r11, r10
	addi	ap, ap, -48
	b	L(6)

C n mod 4 = 1: r9:r0 = 0:highest limb
	ALIGN(16)
L(b01):	li	r9, 0
	ld	r0, -8(ap)
	addi	ap, ap, -24
	b	L(6)

C n mod 4 = 2: r9:r0 = the two highest limbs
	ALIGN(16)
L(b10):	ld	r9, -8(ap)
	ld	r0, -16(ap)
	addi	ap, ap, -32

C Count register = (n + 3) / 4.  The bdz below uses up one count for the limbs
C already folded above, so the loop body runs (n + 3) / 4 - 1 times.  This is
C the last read of n; r4 is reused as scratch inside the loop.
	ALIGN(16)
L(6):	addi	r10, n, 3
	srdi	r7, r10, 2
	mtctr	r7
	bdz	L(end)

C One iteration:
C   r9:r0 <- limb[-16] + limb[-8]*r26 + limb[0]*r25 + limb[8]*r24
C            + r0*r30 + r9*r23
C Offsets are relative to ap on entry to the iteration.  The carry out of the
C last adde is not kept.
C NOTE(review): presumably the table values are small enough that the sum
C always fits in two limbs -- confirm against the cps routine.
	ALIGN(16)
L(top):	ld	r31, -16(ap)
	ld	r10, -8(ap)
	ld	r11, 8(ap)
	ld	r12, 0(ap)
	mulld	r29, r0, r30		C rl * B4modb
	mulhdu	r0, r0, r30		C rl * B4modb
	mulhdu	r27, r10, r26
	mulld	r10, r10, r26
	mulhdu	r7, r9, r23		C rh * B5modb
	mulld	r9, r9, r23		C rh * B5modb
	mulhdu	r28, r11, r24
	mulld	r11, r11, r24
	mulhdu	r4, r12, r25
	mulld	r12, r12, r25
	addc	r8, r10, r31
	addze	r10, r27
	addi	ap, ap, -32		C addi leaves the carry bit untouched
	addc	r27, r8, r12
	adde	r12, r10, r4
	addc	r11, r27, r11
	adde	r31, r12, r28
	addc	r12, r11, r29
	adde	r4, r31, r0
	addc	r0, r9, r12
	adde	r9, r7, r4
	bdnz	L(top)

C Tail.  r3 = shift count: the low 32 bits of the doubleword at cps + 8, which
C sit at byte offset 8 or 12 depending on limb endianness.
L(end):
ifdef(`HAVE_LIMB_LITTLE_ENDIAN',
`	lwz	r3, 8(cps)',
`	lwz	r3, 12(cps)')
	mulld	r10, r9, r26		C fold the high limb once more:
	mulhdu	r9, r9, r26		C   r9:r11 = r0 + r9*r26
	addc	r11, r0, r10
	addze	r9, r9
	ld	r10, 0(cps)
	subfic	r8, r3, 64		C r8 = 64 - count
	sld	r9, r9, r3		C shift r9:r11 left by count
	srd	r8, r11, r8
	sld	r11, r11, r3
	or	r9, r8, r9
	mulld	r0, r9, r10		C r0 (high) : r8 (low) =
	mulhdu	r10, r9, r10		C   r9 * cps[0] + ((r9 + 1) : r11)
	addi	r9, r9, 1
	addc	r8, r0, r11
	adde	r0, r10, r9		C r0 = quotient estimate
	mulld	r0, r0, d
	subf	r0, r0, r11		C candidate = r11 - estimate * d
	cmpld	cr7, r8, r0
	bge	cr7, L(9)
	add	r0, r0, d		C low word below candidate: add d back
L(9):	cmpld	cr7, r0, d
	bge-	cr7, L(16)		C hinted as unlikely
L(10):	srd	r3, r0, r3		C return value = candidate >> count
	ld	r23, -72(r1)
	ld	r24, -64(r1)
	ld	r25, -56(r1)
	ld	r26, -48(r1)
	ld	r27, -40(r1)
	ld	r28, -32(r1)
	ld	r29, -24(r1)
	ld	r30, -16(r1)
	ld	r31, -8(r1)
	blr				C return; without it control falls into L(16)

C Out-of-line fixup: the candidate was still >= d, subtract d once.
L(16):	subf	r0, d, r0
	b	L(10)
EPILOGUE()
C mpn_mod_1s_4p_cps
C
C In:   r3 = pointer to the table to fill (seven doublewords are written)
C       r4 = divisor
C Out:  table filled; nothing is returned in a register
C
C Values written, as byte offsets from the table pointer:
C    0  value returned by mpn_invert_limb for the normalized divisor
C    8  count of leading zero bits of the divisor
C   16  first value of the chain below, shifted right by the count
C   24  second value, shifted right by the count
C   32  third value, shifted right by the count
C   40  fourth value, shifted right by the count
C   48  fifth value, shifted right by the count
C
C With b the divisor shifted left by the count and bi the returned inverse,
C the first chain value is ((1 << count) | (bi >> (64 - count))) * -b.  Each
C later value is derived from the previous value x in the same way:
C   t = not(x + high(x * bi)) * b;  if low(x * bi) < t then t = t + b
C NOTE(review): this looks like repeated multiplication by the limb base
C modulo b using the precomputed inverse -- confirm against the C version.
C
C r29 = table pointer, r30 = b, r31 = count; these are callee-saved and
C therefore survive the call.  The link register is stored at 16(r1) before
C the 144-byte frame is pushed.
PROLOGUE(mpn_mod_1s_4p_cps,toc)
	mflr	r0
	std	r29, -24(r1)
	std	r30, -16(r1)
	mr	r29, r3			C keep the table pointer across the call
	std	r0, 16(r1)
	std	r31, -8(r1)
	stdu	r1, -144(r1)
	cntlzd	r31, r4			C count = leading zero bits of the divisor
	sld	r30, r4, r31		C b = divisor << count
	mr	r3, r30
	CALL(	mpn_invert_limb)
	subfic	r9, r31, 64
	li	r10, 1
	sld	r10, r10, r31
	srd	r9, r3, r9		C bi >> (64 - count)
	neg	r0, r30
	or	r10, r10, r9
	mulld	r10, r10, r0		C first chain value, in r10

	mulhdu	r11, r10, r3		C second chain value, in r11
	nor	r11, r11, r11
	subf	r11, r10, r11		C not(high) - x, equal to not(x + high)
	mulld	r11, r11, r30
	mulld	r0, r10, r3
	cmpld	cr7, r0, r11
	bge	cr7, L(18)
	add	r11, r11, r30
L(18):	mulhdu	r9, r11, r3		C third chain value, in r9
	add	r9, r11, r9
	nor	r9, r9, r9
	mulld	r9, r9, r30
	mulld	r0, r11, r3
	cmpld	cr7, r0, r9
	bge	cr7, L(19)
	add	r9, r9, r30
L(19):	mulhdu	r0, r9, r3		C fourth chain value, in r0
	add	r0, r9, r0
	nor	r0, r0, r0
	mulld	r0, r0, r30
	mulld	r8, r9, r3
	cmpld	cr7, r8, r0
	bge	cr7, L(20)
	add	r0, r0, r30
L(20):	mulhdu	r8, r0, r3		C fifth chain value, in r8
	add	r8, r0, r8
	nor	r8, r8, r8
	mulld	r8, r8, r30
	mulld	r7, r0, r3
	cmpld	cr7, r7, r8
	bge	cr7, L(21)
	add	r8, r8, r30

C Shift the chain values right by the count, store the table, pop the frame
C and restore the saved registers.
L(21):	srd	r0, r0, r31
	addi	r1, r1, 144
	srd	r8, r8, r31
	srd	r10, r10, r31
	srd	r11, r11, r31
	std	r0, 40(r29)
	std	r31, 8(r29)
	srd	r9, r9, r31
	ld	r0, 16(r1)		C saved link register
	ld	r30, -16(r1)
	std	r8, 48(r29)
	std	r3, 0(r29)
	mtlr	r0
	ld	r31, -8(r1)
	std	r10, 16(r29)
	std	r11, 24(r29)
	std	r9, 32(r29)
	ld	r29, -24(r1)		C last use of r29 as the table pointer is above
	blr				C return; without it control runs off the end
EPILOGUE()