beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / alpha / ev6 / mod_1_4.asm
blob82c42ae4ff0e801d6bee2fd41a69d4363d9212c9
1 dnl Alpha mpn_mod_1s_4p
3 dnl Contributed to the GNU project by Torbjorn Granlund.
5 dnl Copyright 2009, 2010 Free Software Foundation, Inc.
7 dnl This file is part of the GNU MP Library.
8 dnl
9 dnl The GNU MP Library is free software; you can redistribute it and/or modify
10 dnl it under the terms of either:
11 dnl
12 dnl * the GNU Lesser General Public License as published by the Free
13 dnl Software Foundation; either version 3 of the License, or (at your
14 dnl option) any later version.
15 dnl
16 dnl or
17 dnl
18 dnl * the GNU General Public License as published by the Free Software
19 dnl Foundation; either version 2 of the License, or (at your option) any
20 dnl later version.
21 dnl
22 dnl or both in parallel, as here.
23 dnl
24 dnl The GNU MP Library is distributed in the hope that it will be useful, but
25 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
26 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
27 dnl for more details.
28 dnl
29 dnl You should have received copies of the GNU General Public License and the
30 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
31 dnl see https://www.gnu.org/licenses/.
33 include(`../config.m4')
35 C TODO:
36 C * Optimise. 2.75 c/l should be possible.
37 C * Write a proper mpn_mod_1s_4p_cps. The code below was compiler generated.
38 C * Optimise feed-in code, starting the sw pipeline in switch code.
39 C * Shorten software pipeline. The mul instructions are scheduled too far
40 C from their users. Fixing this will allow us to use fewer registers.
41 C * If we cannot reduce register usage, perhaps write a small-n basecase.
42 C * Does this work for PIC?
44 C cycles/limb
45 C EV4: ?
46 C EV5: 23
47 C EV6: 3
C Symbolic register names used by mpn_mod_1s_4p.
C   ap, n            pointer into the limb vector and the limb counter; ap is
C                    first moved to the vector end and then stepped downward
C   pl, ph           low and high word of the sum being accumulated for the
C                    current group of limbs
C   rl, rh           low and high word of the running two-word residue
C   B1modb..B5modb   constants loaded from offsets 16, 24, 32, 40 and 48 of
C                    the table addressed by r19
C Note that rl and rh are r6 and r7, so code that names r6 or r7 directly
C is touching the same registers.
49 define(`ap', `r16')
50 define(`n', `r17')
51 define(`pl', `r24')
52 define(`ph', `r25')
53 define(`rl', `r6')
54 define(`rh', `r7')
55 define(`B1modb', `r1')
56 define(`B2modb', `r2')
57 define(`B3modb', `r3')
58 define(`B4modb', `r4')
59 define(`B5modb', `r5')
61 ASM_START()
C mpn_mod_1s_4p
C
C Reduces a limb vector to a single-limb residue, consuming four limbs per
C main-loop iteration with the help of precomputed constants.
C
C Registers read on entry, as used by the code below:
C   r16 (ap)  start of the limb vector; the lowest address holds the limb
C             with the lowest weight
C   r17 (n)   number of limbs
C   r18       divisor b used by the final reduction
C   r19       table of 64-bit words: bi at offset 0, cnt at offset 8,
C             B1modb..B5modb at offsets 16..48
C Result:
C   r0        the final residue shifted right by cnt
C
C NOTE(review): r18 is presumably b already shifted left by cnt, and the
C table is presumably the one filled in by mpn_mod_1s_4p_cps; confirm
C against the callers.
C
C r9-r13 are saved in a 64-byte stack frame and restored before returning.
62 PROLOGUE(mpn_mod_1s_4p)
C Allocate the frame and save r9-r13, interleaved with loading the five
C BKmodb constants from the table.
63 lda r30, -64(r30)
64 stq r9, 8(r30)
65 ldq B1modb, 16(r19)
66 stq r10, 16(r30)
67 ldq B2modb, 24(r19)
68 stq r11, 24(r30)
69 ldq B3modb, 32(r19)
70 stq r12, 32(r30)
71 ldq B4modb, 40(r19)
72 stq r13, 40(r30)
73 ldq B5modb, 48(r19)
74 s8addq n, ap, ap C point ap at vector end
C Dispatch on n mod 4: 0 goes to L(b0), 1 to L(b1), 2 to L(b2) and 3 falls
C through to L(b3).  Each feed-in path consumes that many of the top limbs
C (four for L(b0)) so the rest of the vector is a whole number of groups of
C four.  n is decremented by 4 here; L(com) then tests whether any group is
C left.
76 and n, 3, r0
77 lda n, -4(n)
78 beq r0, L(b0)
79 lda r6, -2(r0)
80 blt r6, L(b1)
81 beq r6, L(b2)
C n mod 4 == 3.  Form rh:rl = limb[-24] + limb[-16]*B1modb + limb[-8]*B2modb
C (byte offsets from the vector end).  Each cmpult recovers the carry out of
C the preceding low-word add.
83 L(b3): ldq r21, -16(ap)
84 ldq r22, -8(ap)
85 ldq r20, -24(ap)
86 mulq r21, B1modb, r8
87 umulh r21, B1modb, r12
88 mulq r22, B2modb, r9
89 umulh r22, B2modb, r13
90 addq r8, r20, pl
91 cmpult pl, r8, r0
92 addq r0, r12, ph
93 addq r9, pl, rl
94 cmpult rl, r9, r0
95 addq r13, ph, ph
96 addq r0, ph, rh
C Leave ap so that 0(ap)..24(ap) address the next four lower limbs.
97 lda ap, -56(ap)
98 br L(com)
C n mod 4 == 0.  Form rh:rl = limb[-32] + limb[-24]*B1modb +
C limb[-16]*B2modb + limb[-8]*B3modb.
100 L(b0): ldq r21, -24(ap)
101 ldq r22, -16(ap)
102 ldq r23, -8(ap)
103 ldq r20, -32(ap)
104 mulq r21, B1modb, r8
105 umulh r21, B1modb, r12
106 mulq r22, B2modb, r9
107 umulh r22, B2modb, r13
108 mulq r23, B3modb, r10
109 umulh r23, B3modb, r27
110 addq r8, r20, pl
111 cmpult pl, r8, r0
112 addq r0, r12, ph
113 addq r9, pl, pl
114 cmpult pl, r9, r0
115 addq r13, ph, ph
116 addq r0, ph, ph
117 addq r10, pl, rl
118 cmpult rl, r10, r0
119 addq r27, ph, ph
120 addq r0, ph, rh
C Leave ap so that 0(ap)..24(ap) address the next four lower limbs.
121 lda ap, -64(ap)
122 br L(com)
C n mod 4 == 1.  The residue starts as the top limb with a zero high word.
124 L(b1): bis r31, r31, rh
125 ldq rl, -8(ap)
126 lda ap, -40(ap)
127 br L(com)
C n mod 4 == 2.  The residue starts as the top two limbs.
129 L(b2): ldq rh, -8(ap)
130 ldq rl, -16(ap)
131 lda ap, -48(ap)
C Common entry.  If no group of four limbs is left, skip to the final
C fold at L(ed3).  Otherwise load the next group, step ap and n, and start
C the products limb*BKmodb and rl*B4modb that the loop or L(ed2) will sum.
133 L(com): ble n, L(ed3)
134 ldq r21, 8(ap)
135 ldq r22, 16(ap)
136 ldq r23, 24(ap)
137 ldq r20, 0(ap)
138 lda n, -4(n)
139 lda ap, -32(ap)
140 mulq r21, B1modb, r8
141 umulh r21, B1modb, r12
142 mulq r22, B2modb, r9
143 umulh r22, B2modb, r13
144 mulq r23, B3modb, r10
145 umulh r23, B3modb, r27
146 mulq rl, B4modb, r11
147 umulh rl, B4modb, r28
C Only one group was pending: go straight to the wind-down code.
148 ble n, L(ed2)
C Main loop, software pipelined.  Each pass sums the products started
C earlier (r8/r12, r9/r13, r10/r27, r11/r28), the limb in r20 and rh*B5modb
C into a new rh:rl, while loading the next group of limbs and starting its
C products.  The order of the instructions is part of the schedule; do not
C rearrange without re-checking every register lifetime.
150 ALIGN(16)
151 L(top): ldq r21, 8(ap)
152 mulq rh, B5modb, rl
153 addq r8, r20, pl
154 ldq r22, 16(ap)
155 cmpult pl, r8, r0
156 umulh rh, B5modb, rh
157 ldq r23, 24(ap)
158 addq r0, r12, ph
159 addq r9, pl, pl
160 mulq r21, B1modb, r8
161 cmpult pl, r9, r0
162 addq r13, ph, ph
163 umulh r21, B1modb, r12
164 lda ap, -32(ap)
165 addq r0, ph, ph
166 addq r10, pl, pl
167 mulq r22, B2modb, r9
168 cmpult pl, r10, r0
169 addq r27, ph, ph
170 addq r11, pl, pl
171 umulh r22, B2modb, r13
172 addq r0, ph, ph
173 cmpult pl, r11, r0
174 addq r28, ph, ph
175 mulq r23, B3modb, r10
C ap was already stepped down by 32 above, so 32(ap) is the limb at offset
C 0 of the group whose other three limbs were loaded at the top of the pass.
176 ldq r20, 32(ap)
177 addq pl, rl, rl
178 umulh r23, B3modb, r27
179 addq r0, ph, ph
180 cmpult rl, pl, r0
181 mulq rl, B4modb, r11
182 addq ph, rh, rh
183 umulh rl, B4modb, r28
184 addq r0, rh, rh
185 lda n, -4(n)
186 bgt n, L(top)
C Wind-down: the same summation as one loop pass, without loading or
C multiplying a further group.
188 L(ed2): mulq rh, B5modb, rl
189 addq r8, r20, pl
190 umulh rh, B5modb, rh
191 cmpult pl, r8, r0
192 addq r0, r12, ph
193 addq r9, pl, pl
194 cmpult pl, r9, r0
195 addq r13, ph, ph
196 addq r0, ph, ph
197 addq r10, pl, pl
198 cmpult pl, r10, r0
199 addq r27, ph, ph
200 addq r11, pl, pl
201 addq r0, ph, ph
202 cmpult pl, r11, r0
203 addq r28, ph, ph
204 addq pl, rl, rl
205 addq r0, ph, ph
206 cmpult rl, pl, r0
207 addq ph, rh, rh
208 addq r0, rh, rh
C Fold the high word once more: rh:rl = rl + rh*B1modb.
210 L(ed3): mulq rh, B1modb, r8
211 umulh rh, B1modb, rh
212 addq r8, rl, rl
213 cmpult rl, r8, r0
214 addq r0, rh, rh
C Shift rh:rl left by cnt.  r25 holds -cnt and is used as the right-shift
C count for the bits moving from rl into rh.
C NOTE(review): this relies on the shift instructions using only the low
C six bits of the count, and on cnt being nonzero; confirm both.
216 ldq r24, 8(r19) C cnt
217 sll rh, r24, rh
218 subq r31, r24, r25
219 srl rl, r25, r2
220 sll rl, r24, rl
221 or r2, rh, rh
C Final reduction of rh:rl by b using the precomputed bi.  The quotient
C estimate qh:ql is rh*bi + (rh+1):rl; the candidate remainder is
C rl - qh*b, followed by two masked corrections (add b if the remainder
C exceeds ql, then subtract b if the remainder is still >= b).
223 ldq r23, 0(r19) C bi
224 mulq rh, r23, r8
225 umulh rh, r23, r9
226 addq rh, 1, r7
227 addq r8, rl, r8 C ql
228 cmpult r8, rl, r0
229 addq r9, r7, r9
230 addq r0, r9, r9 C qh
231 mulq r9, r18, r21 C qh * b
232 subq rl, r21, rl
233 cmpult r8, rl, r0 C rl > ql
234 negq r0, r0
235 and r0, r18, r0
236 addq rl, r0, rl
237 cmpule r18, rl, r0 C rl >= b
238 negq r0, r0
239 and r0, r18, r0
240 subq rl, r0, rl
C Undo the shift by cnt to produce the return value.
242 srl rl, r24, r0
C Restore the saved registers, release the frame and return.
244 ldq r9, 8(r30)
245 ldq r10, 16(r30)
246 ldq r11, 24(r30)
247 ldq r12, 32(r30)
248 ldq r13, 40(r30)
249 lda r30, 64(r30)
250 ret r31, (r26), 1
251 EPILOGUE()
C mpn_mod_1s_4p_cps
C
C Fills a seven-word table of precomputed values.
C
C Registers read on entry, as used by the code below:
C   r16   address of the table to fill (kept in r11 across the call)
C   r17   the value the constants are computed for
C Table layout written here (byte offsets):
C    0    value returned by mpn_invert_limb for the shifted r17
C    8    the shift count
C   16, 24, 32, 40, 48   five successive residues, each shifted right by
C                        the shift count before being stored
C
C NOTE(review): r17 is presumably the divisor b, and the table is
C presumably the one mpn_mod_1s_4p reads through r19 (bi at 0, cnt at 8,
C B1modb..B5modb at 16..48); confirm against the callers.
C
C The file header notes that this routine was compiler generated, which
C explains the heavy register reuse.  r26, r9, r10 and r11 are saved in a
C 32-byte frame because the routine calls mpn_invert_limb.
253 PROLOGUE(mpn_mod_1s_4p_cps,gp)
254 lda r30, -32(r30)
255 stq r26, 0(r30)
256 stq r9, 8(r30)
257 stq r10, 16(r30)
258 stq r11, 24(r30)
259 mov r16, r11
C Compute the shift count in r10 with two lookups in __clz_tab: the first
C lookup is indexed by a mask derived from which bytes of r17 are zero and
C yields a coarse shift r2, the second is indexed by r17 >> r2.  The result
C is r10 = 65 - r2 - table[r17 >> r2].
C NOTE(review): this appears to be a count-leading-zeros of r17; confirm
C against the definition of __clz_tab.
260 LEA( r4, __clz_tab)
261 lda r10, 65(r31)
262 cmpbge r31, r17, r1
263 srl r1, 1, r1
264 xor r1, 127, r1
265 addq r1, r4, r1
266 ldq_u r2, 0(r1)
267 extbl r2, r1, r2
268 s8subq r2, 7, r2
269 srl r17, r2, r3
270 subq r10, r2, r10
271 addq r3, r4, r3
272 ldq_u r1, 0(r3)
273 extbl r1, r3, r1
274 subq r10, r1, r10
C r9 = r17 shifted left by the count; it is the argument of
C mpn_invert_limb and the modulus for every step below.
275 sll r17, r10, r9
276 mov r9, r16
277 jsr r26, mpn_invert_limb
278 LDGP( r29, 0(r26))
C r0 now holds the value returned by mpn_invert_limb.  First residue:
C r2 = ((r0 >> -count) | (1 << count)) * (-r9), low 64 bits.  The saved
C return address is reloaded and table words 0 and 8 are stored in
C between.
279 subq r31, r10, r2
280 lda r1, 1(r31)
281 sll r1, r10, r1
282 subq r31, r9, r3
283 srl r0, r2, r2
284 ldq r26, 0(r30)
285 bis r2, r1, r2
286 stq r0, 0(r11)
287 stq r10, 8(r11)
288 mulq r2, r3, r2
C Store the first residue (shifted right by the count) at offset 16 and
C derive the second.  Each of the four steps below has the same shape:
C with x the previous residue, hi:lo = x * r0, r = (~hi - x) * r9, and
C r + r9 is selected instead of r when r > lo (cmpule yields 0, so cmoveq
C takes the alternative).
289 srl r2, r10, r3
290 umulh r2, r0, r1
291 stq r3, 16(r11)
292 mulq r2, r0, r3
293 ornot r31, r1, r1
294 subq r1, r2, r1
295 mulq r1, r9, r1
296 addq r1, r9, r2
297 cmpule r1, r3, r3
298 cmoveq r3, r2, r1
C Store the second residue at offset 24 and derive the third.
299 srl r1, r10, r3
300 umulh r1, r0, r2
301 stq r3, 24(r11)
302 mulq r1, r0, r3
303 ornot r31, r2, r2
304 subq r2, r1, r2
305 mulq r2, r9, r2
306 addq r2, r9, r1
307 cmpule r2, r3, r3
308 cmoveq r3, r1, r2
C Store the third residue at offset 32 and derive the fourth.
309 srl r2, r10, r1
310 umulh r2, r0, r3
311 stq r1, 32(r11)
312 mulq r2, r0, r1
313 ornot r31, r3, r3
314 subq r3, r2, r3
315 mulq r3, r9, r3
316 addq r3, r9, r2
317 cmpule r3, r1, r1
318 cmoveq r1, r2, r3
C Store the fourth residue at offset 40 and derive the fifth.  r0 and r9
C are overwritten here because nothing needs them afterwards.
319 srl r3, r10, r2
320 umulh r3, r0, r1
321 stq r2, 40(r11)
322 mulq r3, r0, r0
323 ornot r31, r1, r1
324 subq r1, r3, r1
325 mulq r1, r9, r1
326 addq r1, r9, r9
327 cmpule r1, r0, r0
328 cmoveq r0, r9, r1
C Store the fifth residue at offset 48, interleaved with restoring r9-r11.
C The final shift reads r10 before r10 is reloaded, and the store uses r11
C before r11 is reloaded.
329 ldq r9, 8(r30)
330 srl r1, r10, r1
331 ldq r10, 16(r30)
332 stq r1, 48(r11)
333 ldq r11, 24(r30)
334 lda r30, 32(r30)
335 ret r31, (r26), 1
336 EPILOGUE()