beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / alpha / ev6 / nails / submul_1.asm
blobf473a59ba824bdb67715e87f2061345047e76659
1 dnl Alpha ev6 nails mpn_submul_1.
3 dnl Copyright 2002, 2005, 2006 Free Software Foundation, Inc.
5 dnl This file is part of the GNU MP Library.
6 dnl
7 dnl The GNU MP Library is free software; you can redistribute it and/or modify
8 dnl it under the terms of either:
9 dnl
10 dnl * the GNU Lesser General Public License as published by the Free
11 dnl Software Foundation; either version 3 of the License, or (at your
12 dnl option) any later version.
13 dnl
14 dnl or
15 dnl
16 dnl * the GNU General Public License as published by the Free Software
17 dnl Foundation; either version 2 of the License, or (at your option) any
18 dnl later version.
19 dnl
20 dnl or both in parallel, as here.
21 dnl
22 dnl The GNU MP Library is distributed in the hope that it will be useful, but
23 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
25 dnl for more details.
26 dnl
27 dnl You should have received copies of the GNU General Public License and the
28 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
29 dnl see https://www.gnu.org/licenses/.
31 include(`../config.m4')
33 C cycles/limb
34 C EV4: 42
35 C EV5: 18
36 C EV6: 4
38 C TODO
39 C * Reroll loop for 3.75 c/l with current 4-way unrolling.
40 C * The loop is overscheduled wrt loads and wrt multiplies, in particular
41 C umulh.
42 C * Use FP loop count and multiple exit points, that would simplify feed-in lp0
43 C and would work since the loop structure is really regular.
45 C INPUT PARAMETERS
C NOTE(review): the decimal number that starts every statement line in this
C file looks like line-number residue from a source viewer rather than
C assembly text; confirm and strip before this is fed to m4.
C
C Argument registers.  rp is both loaded from and stored to, up is only
C loaded from, n is the limb count (dispatched on n mod 4, then decremented
C by 4 per loop pass), vl0 is the multiplier applied to every up limb.
46 define(`rp',`r16')
47 define(`up',`r17')
48 define(`n', `r18')
49 define(`vl0',`r19')
C Set up at function entry as all-ones shifted right by NAIL_BITS; each
C accumulator is ANDed with it to form the limb that gets stored.
51 define(`numb_mask',`r6')
C Product registers for the four unrolled limb positions 0..3: mXa receives
C the mulq result and mXb the umulh result of vl0 times limb X.
C m0a is r0, which the exit code also writes last before ret.  m2b is r21,
C which the entry code uses as a compare scratch before any product is formed.
53 define(`m0a',`r0')
54 define(`m0b',`r1')
55 define(`m1a',`r2')
56 define(`m1b',`r3')
57 define(`m2a',`r20')
58 define(`m2b',`r21')
59 define(`m3a',`r22')
60 define(`m3b',`r23')
C Two accumulators, used alternately by successive limbs (positions 0 and 2
C use acc0, positions 1 and 3 use acc1).  acc0 is r25, which the entry code
C first uses to hold n mod 4.
62 define(`acc0',`r25')
63 define(`acc1',`r27')
C Limbs loaded from up.  Only two physical registers back the four names:
C positions 0 and 2 share r4, positions 1 and 3 share r5.
65 define(`ul0',`r4')
66 define(`ul1',`r5')
67 define(`ul2',`r4')
68 define(`ul3',`r5')
C Limbs loaded from rp.  All four names map to the single register r24.
70 define(`rl0',`r24')
71 define(`rl1',`r24')
72 define(`rl2',`r24')
73 define(`rl3',`r24')
C Scratch: t0 receives each mulq result shifted right by NAIL_BITS, t1 the
C arithmetic right shift of the previous accumulator by NUMB_BITS.
75 define(`t0',`r7')
76 define(`t1',`r8')
C Short local names for the nail and numb bit counts.
78 define(`NAIL_BITS',`GMP_NAIL_BITS')
79 define(`NUMB_BITS',`GMP_NUMB_BITS')
81 dnl This declaration is munged by configure
82 NAILS_SUPPORT(2-63)
84 ASM_START()
C mpn_submul_1, nails variant.
C
C For each of the n limbs: multiply the up limb by vl0, subtract the product
C (plus what carried over from the previous limb) from the rp limb, and store
C the result masked with numb_mask back to rp.  The value left in r0 at ret is
C the high part of the last product minus the final shifted-down accumulator.
C
C NOTE(review): the decimal number that starts every statement line looks like
C line-number residue from a source viewer rather than assembly text; confirm
C and strip before this is fed to m4.
C
C Per-limb recurrence, with X the limb position and W the position before it
C (the instructions for one limb are interleaved with those of its neighbours):
C   t0   = mXa >> NAIL_BITS     (srl, logical)
C   accX = t0 + mWb             (r31, i.e. zero, replaces mWb for the first limb)
C   accX = rlX - accX
C   accX = accX + t1            (omitted for the first limb)
C   t1   = accX >> NUMB_BITS    (sra, arithmetic)
C   r28  = accX & numb_mask     (stored to rp)
C vl0 is shifted left by NAIL_BITS up front, so presumably (assuming NAIL_BITS
C plus NUMB_BITS is the full register width; TODO confirm) mXa >> NAIL_BITS is
C the low NUMB_BITS of the true product and mXb is everything above that.
C
C Every entry path arranges for the last limb to sit in position 1, which is
C why both exits compute the return value from m1b.
C
C NOTE(review): n = 0 takes the L(0m4) path, which processes four limbs
C unconditionally; callers presumably guarantee n >= 1.
85 PROLOGUE(mpn_submul_1)
C Pre-shift the multiplier and build numb_mask = (all ones) >> NAIL_BITS.
86 sll vl0, NAIL_BITS, vl0
87 lda numb_mask, -1(r31)
88 srl numb_mask, NAIL_BITS, numb_mask
C Dispatch on n mod 4.  r25 and r21 are free as scratch here; they are later
C used as acc0 and m2b.  A remainder of 3 falls through to L(3m4).
90 and n, 3, r25
91 cmpeq r25, 1, r21
92 bne r21, L(1m4)
93 cmpeq r25, 2, r21
94 bne r21, L(2m4)
95 beq r25, L(0m4)
C n mod 4 == 3: the first three limbs occupy positions 3, 0, 1.  rp is biased
C by -8 so the first limb is addressed as 8(rp).  If n - 4 >= 0 there are more
C limbs and the loop is needed (L(ge3)); otherwise finish in the tail.
97 L(3m4): ldq ul3, 0(up)
98 lda n, -4(n)
99 ldq ul0, 8(up)
100 mulq vl0, ul3, m3a
101 umulh vl0, ul3, m3b
102 ldq ul1, 16(up)
103 lda up, 24(up)
104 lda rp, -8(rp)
105 mulq vl0, ul0, m0a
106 umulh vl0, ul0, m0b
107 bge n, L(ge3)
C Exactly three limbs: start limb 3 (no incoming high part, hence r31) and
C limb 0, then join the tail at L(ta3).
109 mulq vl0, ul1, m1a
110 umulh vl0, ul1, m1b
111 ldq rl3, 8(rp)
112 srl m3a,NAIL_BITS, t0
113 addq t0, r31, acc1
114 subq rl3, acc1, acc1
115 ldq rl0, 16(rp)
116 srl m0a,NAIL_BITS, t0
117 addq t0, m3b, acc0
118 sra acc1,NUMB_BITS, t1
119 br r31, L(ta3)
C More than three limbs: same start as above, but also load and multiply the
C next limbs so the loop can be entered mid-pass at L(el3).
121 L(ge3): ldq ul2, 0(up)
122 mulq vl0, ul1, m1a
123 umulh vl0, ul1, m1b
124 ldq rl3, 8(rp)
125 srl m3a,NAIL_BITS, t0
126 ldq ul3, 8(up)
127 lda n, -4(n)
128 mulq vl0, ul2, m2a
129 addq t0, r31, acc1
130 umulh vl0, ul2, m2b
131 subq rl3, acc1, acc1
132 ldq rl0, 16(rp)
133 srl m0a,NAIL_BITS, t0
134 ldq ul0, 16(up)
135 mulq vl0, ul3, m3a
136 addq t0, m3b, acc0
137 sra acc1,NUMB_BITS, t1
138 br r31, L(el3)
C n mod 4 == 0: the first four limbs occupy positions 2, 3, 0, 1.  rp is not
C biased.  n is reduced by 8 here; if the result is >= 0 go to L(ge4) to enter
C the loop, otherwise finish in the tail.
140 L(0m4): lda n, -8(n)
141 ldq ul2, 0(up)
142 ldq ul3, 8(up)
143 mulq vl0, ul2, m2a
144 umulh vl0, ul2, m2b
145 ldq ul0, 16(up)
146 mulq vl0, ul3, m3a
147 umulh vl0, ul3, m3b
148 ldq ul1, 24(up)
149 lda up, 32(up)
150 mulq vl0, ul0, m0a
151 umulh vl0, ul0, m0b
152 bge n, L(ge4)
C No loop pass needed: start limbs 2 and 3, then join the tail at L(ta4).
154 ldq rl2, 0(rp)
155 srl m2a,NAIL_BITS, t0
156 mulq vl0, ul1, m1a
157 addq t0, r31, acc0
158 umulh vl0, ul1, m1b
159 subq rl2, acc0, acc0
160 ldq rl3, 8(rp)
161 srl m3a,NAIL_BITS, t0
162 addq t0, m2b, acc1
163 sra acc0,NUMB_BITS, t1
164 br r31, L(ta4)
C Loop needed: same start, plus loads and a multiply for the following limbs;
C enter the loop at L(el0).
166 L(ge4): ldq rl2, 0(rp)
167 srl m2a,NAIL_BITS, t0
168 ldq ul2, 0(up)
169 mulq vl0, ul1, m1a
170 addq t0, r31, acc0
171 umulh vl0, ul1, m1b
172 subq rl2, acc0, acc0
173 ldq rl3, 8(rp)
174 srl m3a,NAIL_BITS, t0
175 ldq ul3, 8(up)
176 lda n, -4(n)
177 mulq vl0, ul2, m2a
178 addq t0, m2b, acc1
179 sra acc0,NUMB_BITS, t1
180 br r31, L(el0)
C n mod 4 == 2: the first two limbs occupy positions 0, 1.  rp is biased by
C -16 so the first limb is addressed as 16(rp).
182 L(2m4): lda n, -4(n)
183 ldq ul0, 0(up)
184 ldq ul1, 8(up)
185 lda up, 16(up)
186 lda rp, -16(rp)
187 mulq vl0, ul0, m0a
188 umulh vl0, ul0, m0b
189 bge n, L(ge2)
C Exactly two limbs: start limb 0, prepare limb 1, finish at L(ta2).
191 mulq vl0, ul1, m1a
192 umulh vl0, ul1, m1b
193 ldq rl0, 16(rp)
194 srl m0a,NAIL_BITS, t0
195 addq t0, r31, acc0
196 subq rl0, acc0, acc0
197 ldq rl1, 24(rp)
198 srl m1a,NAIL_BITS, t0
199 addq t0, m0b, acc1
200 sra acc0,NUMB_BITS, t1
201 br r31, L(ta2)
C More than two limbs: load and multiply the next four, advance up and rp by
C 32, then either enter the loop at L(el2) or, when n has gone negative, go
C straight to the tail at L(ta6).
203 L(ge2): ldq ul2, 0(up)
204 mulq vl0, ul1, m1a
205 umulh vl0, ul1, m1b
206 ldq ul3, 8(up)
207 lda n, -4(n)
208 mulq vl0, ul2, m2a
209 umulh vl0, ul2, m2b
210 ldq rl0, 16(rp)
211 srl m0a,NAIL_BITS, t0
212 ldq ul0, 16(up)
213 mulq vl0, ul3, m3a
214 addq t0, r31, acc0
215 umulh vl0, ul3, m3b
216 subq rl0, acc0, acc0
217 ldq rl1, 24(rp)
218 srl m1a,NAIL_BITS, t0
219 ldq ul1, 24(up)
220 lda up, 32(up)
221 lda rp, 32(rp)
222 mulq vl0, ul0, m0a
223 addq t0, m0b, acc1
224 sra acc0,NUMB_BITS, t1
225 bge n, L(el2)
227 br r31, L(ta6)
C n mod 4 == 1: the first limb occupies position 1.  rp is biased by -24 so
C the first limb is addressed as 24(rp).
229 L(1m4): lda n, -4(n)
230 ldq ul1, 0(up)
231 lda up, 8(up)
232 lda rp, -24(rp)
233 bge n, L(ge1)
C Exactly one limb: compute it in full and return without touching the loop
C or the tail.  r0 = m1b - t1 is the return value.
235 mulq vl0, ul1, m1a
236 umulh vl0, ul1, m1b
237 ldq rl1, 24(rp)
238 srl m1a,NAIL_BITS, t0
239 subq rl1, t0, acc1
240 and acc1,numb_mask, r28
241 sra acc1,NUMB_BITS, t1
242 stq r28, 24(rp)
243 subq m1b, t1, r0
244 ret r31, (r26), 1
C More than one limb: load and multiply the next four, advance up and rp by
C 32, start limb 1 and prepare limb 2.  If n has gone negative finish in the
C tail at L(ta5); otherwise fall into L(ge5).
246 L(ge1): ldq ul2, 0(up)
247 mulq vl0, ul1, m1a
248 umulh vl0, ul1, m1b
249 ldq ul3, 8(up)
250 lda n, -4(n)
251 mulq vl0, ul2, m2a
252 umulh vl0, ul2, m2b
253 ldq ul0, 16(up)
254 mulq vl0, ul3, m3a
255 umulh vl0, ul3, m3b
256 ldq rl1, 24(rp)
257 srl m1a,NAIL_BITS, t0
258 ldq ul1, 24(up)
259 lda up, 32(up)
260 lda rp, 32(rp)
261 mulq vl0, ul0, m0a
262 addq t0, r31, acc1
263 umulh vl0, ul0, m0b
264 subq rl1, acc1, acc1
265 ldq rl2, 0(rp)
266 srl m2a,NAIL_BITS, t0
267 mulq vl0, ul1, m1a
268 addq t0, m1b, acc0
269 sra acc1,NUMB_BITS, t1
270 blt n, L(ta5)
C Load the next position-2 limb and enter the loop at L(el1).
272 L(ge5): ldq ul2, 0(up)
273 br r31, L(el1)
C Main loop, four limbs per pass.  Each pass subtracts 4 from n, advances up
C and rp by 32 bytes, and repeats while n >= 0.  L(el2), L(el1), L(el0) and
C L(el3) are the mid-pass entry points used by the feed-in code above.
C Loads and multiplies for later limbs are issued ahead of the
C subtract/mask/store of the current limb, so statement order matters.
C NOTE(review): the trailing U0/U1/L0/L1 tags are presumably the intended EV6
C issue slot of each instruction; confirm against EV6 documentation.
275 ALIGN(16)
276 L(top): mulq vl0, ul0, m0a C U1
277 addq t0, m0b, acc1 C L0
278 sra acc0,NUMB_BITS, t1 C U0
279 stq r28, -24(rp) C L1
281 L(el2): umulh vl0, ul0, m0b C U1
282 and acc0,numb_mask, r28 C L0
283 subq rl1, acc1, acc1 C U0
284 ldq rl2, 0(rp) C L1
286 unop C U1
287 addq t1, acc1, acc1 C L0
288 srl m2a,NAIL_BITS, t0 C U0
289 ldq ul2, 0(up) C L1
291 mulq vl0, ul1, m1a C U1
292 addq t0, m1b, acc0 C L0
293 sra acc1,NUMB_BITS, t1 C U0
294 stq r28, -16(rp) C L1
296 L(el1): umulh vl0, ul1, m1b C U1
297 and acc1,numb_mask, r28 C L0
298 subq rl2, acc0, acc0 C U0
299 ldq rl3, 8(rp) C L1
301 lda n, -4(n) C L1
302 addq t1, acc0, acc0 C L0
303 srl m3a,NAIL_BITS, t0 C U0
304 ldq ul3, 8(up) C L1
306 mulq vl0, ul2, m2a C U1
307 addq t0, m2b, acc1 C L0
308 sra acc0,NUMB_BITS, t1 C U0
309 stq r28, -8(rp) C L1
311 L(el0): umulh vl0, ul2, m2b C U1
312 and acc0,numb_mask, r28 C L0
313 subq rl3, acc1, acc1 C U0
314 ldq rl0, 16(rp) C L1
316 unop C U1
317 addq t1, acc1, acc1 C L0
318 srl m0a,NAIL_BITS, t0 C U0
319 ldq ul0, 16(up) C L1
321 mulq vl0, ul3, m3a C U1
322 addq t0, m3b, acc0 C L0
323 sra acc1,NUMB_BITS, t1 C U0
324 stq r28, 0(rp) C L1
326 L(el3): umulh vl0, ul3, m3b C U1
327 and acc1,numb_mask, r28 C L0
328 subq rl0, acc0, acc0 C U0
329 ldq rl1, 24(rp) C L1
331 unop C U1
332 addq t1, acc0, acc0 C L0
333 srl m1a,NAIL_BITS, t0 C U0
334 ldq ul1, 24(up) C L1
336 lda up, 32(up) C L0
337 unop C U1
338 lda rp, 32(rp) C L1
339 bge n, L(top) C U0
C Tail: drains the limbs already in flight.  Nothing below loads from up; the
C remaining multiplies use ul0 and ul1 values that were loaded earlier.
C L(end) is reached by falling out of the loop; L(ta6), L(ta5), L(ta4), L(ta3)
C and L(ta2) are entry points for feed-in paths that bypass the loop.
341 L(end): mulq vl0, ul0, m0a
342 addq t0, m0b, acc1
343 sra acc0,NUMB_BITS, t1
344 stq r28, -24(rp)
345 L(ta6): umulh vl0, ul0, m0b
346 and acc0,numb_mask, r28
347 subq rl1, acc1, acc1
348 ldq rl2, 0(rp)
349 addq t1, acc1, acc1
350 srl m2a,NAIL_BITS, t0
351 mulq vl0, ul1, m1a
352 addq t0, m1b, acc0
353 sra acc1,NUMB_BITS, t1
354 stq r28, -16(rp)
355 L(ta5): umulh vl0, ul1, m1b
356 and acc1,numb_mask, r28
357 subq rl2, acc0, acc0
358 ldq rl3, 8(rp)
359 addq t1, acc0, acc0
360 srl m3a,NAIL_BITS, t0
361 addq t0, m2b, acc1
362 sra acc0,NUMB_BITS, t1
363 stq r28, -8(rp)
C The unop plus ALIGN(16) pairs below pad so that each following tail entry
C label starts on a 16-byte boundary.
364 unop
365 ALIGN(16)
366 L(ta4): and acc0,numb_mask, r28
367 subq rl3, acc1, acc1
368 ldq rl0, 16(rp)
369 addq t1, acc1, acc1
370 srl m0a,NAIL_BITS, t0
371 addq t0, m3b, acc0
372 sra acc1,NUMB_BITS, t1
373 stq r28, 0(rp)
374 unop
375 ALIGN(16)
376 L(ta3): and acc1,numb_mask, r28
377 subq rl0, acc0, acc0
378 ldq rl1, 24(rp)
379 addq t1, acc0, acc0
380 srl m1a,NAIL_BITS, t0
381 addq t0, m0b, acc1
382 sra acc0,NUMB_BITS, t1
383 stq r28, 8(rp)
384 unop
385 ALIGN(16)
C Last two stores.  The final limb is always position 1, so the return value
C r0 = m1b - t1 combines the high part of the last product with the final
C shifted-down accumulator.  r0 is also m0a, which is no longer needed here.
386 L(ta2): and acc0,numb_mask, r28
387 subq rl1, acc1, acc1
388 addq t1, acc1, acc1
389 sra acc1,NUMB_BITS, t1
390 stq r28, 16(rp)
391 and acc1,numb_mask, r28
392 subq m1b, t1, r0
393 stq r28, 24(rp)
394 ret r31, (r26), 1
395 EPILOGUE()
396 ASM_END()