beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / alpha / ev6 / nails / mul_1.asm
blobda2ee3d09978d209aa4c460c98b1039ec86e2eef
1 dnl Alpha ev6 nails mpn_mul_1.
3 dnl Copyright 2002, 2005, 2006 Free Software Foundation, Inc.
5 dnl This file is part of the GNU MP Library.
6 dnl
7 dnl The GNU MP Library is free software; you can redistribute it and/or modify
8 dnl it under the terms of either:
9 dnl
10 dnl * the GNU Lesser General Public License as published by the Free
11 dnl Software Foundation; either version 3 of the License, or (at your
12 dnl option) any later version.
13 dnl
14 dnl or
15 dnl
16 dnl * the GNU General Public License as published by the Free Software
17 dnl Foundation; either version 2 of the License, or (at your option) any
18 dnl later version.
19 dnl
20 dnl or both in parallel, as here.
21 dnl
22 dnl The GNU MP Library is distributed in the hope that it will be useful, but
23 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
25 dnl for more details.
26 dnl
27 dnl You should have received copies of the GNU General Public License and the
28 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
29 dnl see https://www.gnu.org/licenses/.
31 include(`../config.m4')
33 C cycles/limb
34 C EV4: 42
35 C EV5: 18
36 C EV6: 3.25
38 C TODO
39 C * Reroll loop for 3.0 c/l with current 4-way unrolling.
40 C * The loop is overscheduled wrt loads and wrt multiplies, in particular
41 C umulh.
42 C * Use FP loop count and multiple exit points, that would simplify feed-in lp0
43 C and would work since the loop structure is really regular.
45 C INPUT PARAMETERS
C rp  : base for every stq in mpn_mul_1 (result limbs are written through it)
C up  : base for every ldq in mpn_mul_1 (source limbs are read through it)
C n   : limb count; n AND 3 selects the entry path, then counts down by 4
C vl0 : the single multiplier limb; shifted left by NAIL_BITS on entry
46 define(`rp',`r16')
47 define(`up',`r17')
48 define(`n', `r18')
49 define(`vl0',`r19')
C numb_mask is built in the function as (all ones) >> NAIL_BITS and is ANDed
C with each accumulator before the store.
51 define(`numb_mask',`r6')
C Product registers for the four pipeline slots 0..3:
C   mKa receives mulq  (low 64 bits of the shifted product)
C   mKb receives umulh (high 64 bits of the shifted product)
C Note m0a is r0, which is also the register written last with the return value.
53 define(`m0a',`r0')
54 define(`m0b',`r1')
55 define(`m1a',`r2')
56 define(`m1b',`r3')
57 define(`m2a',`r20')
58 define(`m2b',`r21')
59 define(`m3a',`r22')
60 define(`m3b',`r23')
C Two accumulators used alternately, one result limb each.  r25 (acc0) and
C r21 (m2b) are also used as scratch by the n AND 3 dispatch at function entry.
62 define(`acc0',`r25')
63 define(`acc1',`r27')
C Source limb registers.  ul0 and ul2 share r4, ul1 and ul3 share r5, so a
C load into ul2 (ul3) destroys ul0 (ul1); both the mulq and the umulh of the
C old value must have been issued before the register is reloaded.
65 define(`ul0',`r4')
66 define(`ul1',`r5')
67 define(`ul2',`r4')
68 define(`ul3',`r5')
C rl0..rl3 all name r24.  None of them is referenced in the code of this file
C as seen here.  NOTE(review): looks like a leftover from a sibling routine
C that reads rp - confirm before removing.
70 define(`rl0',`r24')
71 define(`rl1',`r24')
72 define(`rl2',`r24')
73 define(`rl3',`r24')
C t0 holds a low product half shifted right by NAIL_BITS.
C t1 holds the carry out of an accumulator (accumulator >> NUMB_BITS).
75 define(`t0',`r7')
76 define(`t1',`r8')
C Short aliases for GMP_NAIL_BITS / GMP_NUMB_BITS, which are not defined in
C this file (presumably supplied through the included config.m4).
78 define(`NAIL_BITS',`GMP_NAIL_BITS')
79 define(`NUMB_BITS',`GMP_NUMB_BITS')
81 dnl This declaration is munged by configure
82 NAILS_SUPPORT(1-63)
84 ASM_START()
C mpn_mul_1, nails variant.
C
C Multiplies the limbs read through up by the single limb vl0 and stores one
C masked result limb through rp per source limb.  The value left in r0 on
C return is the final carry plus the high product half of the last limb.
C n selects the entry path (n AND 3) and is then used as a down-counter in
C steps of 4.
C
C Per limb the code computes, with v = vl0 << NAIL_BITS:
C   lo  = mulq(v, ul) >> NAIL_BITS
C   hi  = umulh(v, ul)
C   acc = lo + (hi of the previous limb) + (carry of the previous limb)
C   stored limb = acc AND numb_mask,   carry = acc >> NUMB_BITS
C NOTE(review): lo is the low NUMB_BITS of the true product and hi is the
C product >> NUMB_BITS only if NAIL_BITS + NUMB_BITS = 64 and the inputs have
C clear nail bits - confirm against the GMP nails definitions.
C
C NOTE(review): n = 0 is not special-cased; it takes the L(0m4) path, which
C loads and stores four limbs.  Presumably callers guarantee n >= 1.
C
C Registers touched between PROLOGUE and EPILOGUE: those named in the define
C list of this file, plus r28 (staging for each stq), r31 (reads as zero) and
C r26 (return address used by ret).
85 PROLOGUE(mpn_mul_1)
C Pre-shift the multiplier once and build numb_mask = (all ones) >> NAIL_BITS.
86 sll vl0, NAIL_BITS, vl0
87 lda numb_mask, -1(r31)
88 srl numb_mask, NAIL_BITS, numb_mask
C Dispatch on n AND 3.  r25 and r21 are free to use as scratch here because
C acc0 and m2b are not live yet.  Remainder 3 falls through to L(3m4).
90 and n, 3, r25
91 cmpeq r25, 1, r21
92 bne r21, L(1m4)
93 cmpeq r25, 2, r21
94 bne r21, L(2m4)
95 beq r25, L(0m4)
C Feed-in for n AND 3 = 3.  rp is biased by -8 so that the fixed store
C offsets of the shared wind-down code (8, 16, 24) hit result limbs 0, 1, 2.
C After n -= 4 a non-negative n means at least 7 limbs: go to L(ge3).
97 L(3m4): ldq ul3, 0(up)
98 lda n, -4(n)
99 ldq ul0, 8(up)
100 mulq vl0, ul3, m3a
101 umulh vl0, ul3, m3b
102 ldq ul1, 16(up)
103 lda up, 24(up)
104 lda rp, -8(rp)
105 mulq vl0, ul0, m0a
106 umulh vl0, ul0, m0b
107 bge n, L(ge3)
C Exactly 3 limbs: finish the third product and join the wind-down at L(ta3).
C addq t0, r31, acc1 is a plain copy; the first limb has no incoming high part.
109 mulq vl0, ul1, m1a
110 umulh vl0, ul1, m1b
111 srl m3a,NAIL_BITS, t0
112 addq t0, r31, acc1
113 srl m0a,NAIL_BITS, t0
114 addq t0, m3b, acc0
115 srl acc1,NUMB_BITS, t1
116 br r31, L(ta3)
C 7 or more limbs: prime the pipeline and enter the main loop at L(el3).
118 L(ge3): ldq ul2, 0(up)
119 mulq vl0, ul1, m1a
120 umulh vl0, ul1, m1b
121 srl m3a,NAIL_BITS, t0
122 ldq ul3, 8(up)
123 lda n, -4(n)
124 mulq vl0, ul2, m2a
125 addq t0, r31, acc1
126 umulh vl0, ul2, m2b
127 srl m0a,NAIL_BITS, t0
128 ldq ul0, 16(up)
129 mulq vl0, ul3, m3a
130 addq t0, m3b, acc0
131 srl acc1,NUMB_BITS, t1
132 br r31, L(el3)
C Feed-in for n AND 3 = 0.  rp needs no bias on this path.  n is reduced by
C 8 here, so a non-negative n means at least 8 limbs: go to L(ge4).
134 L(0m4): lda n, -8(n)
135 ldq ul2, 0(up)
136 ldq ul3, 8(up)
137 mulq vl0, ul2, m2a
138 umulh vl0, ul2, m2b
139 ldq ul0, 16(up)
140 mulq vl0, ul3, m3a
141 umulh vl0, ul3, m3b
142 ldq ul1, 24(up)
143 lda up, 32(up)
144 mulq vl0, ul0, m0a
145 umulh vl0, ul0, m0b
146 bge n, L(ge4)
C Fall-through when n went negative (4 limbs when n >= 1): join the
C wind-down at L(ta4).
148 srl m2a,NAIL_BITS, t0
149 mulq vl0, ul1, m1a
150 addq t0, r31, acc0
151 umulh vl0, ul1, m1b
152 srl m3a,NAIL_BITS, t0
153 addq t0, m2b, acc1
154 srl acc0,NUMB_BITS, t1
155 br r31, L(ta4)
C 8 or more limbs: prime the pipeline and enter the main loop at L(el0).
157 L(ge4): srl m2a,NAIL_BITS, t0
158 ldq ul2, 0(up)
159 mulq vl0, ul1, m1a
160 addq t0, r31, acc0
161 umulh vl0, ul1, m1b
162 srl m3a,NAIL_BITS, t0
163 ldq ul3, 8(up)
164 lda n, -4(n)
165 mulq vl0, ul2, m2a
166 addq t0, m2b, acc1
167 srl acc0,NUMB_BITS, t1
168 br r31, L(el0)
C Feed-in for n AND 3 = 2.  rp is biased by -16 so that the wind-down stores
C at 16(rp) and 24(rp) hit result limbs 0 and 1.
170 L(2m4): lda n, -4(n)
171 ldq ul0, 0(up)
172 ldq ul1, 8(up)
173 lda up, 16(up)
174 lda rp, -16(rp)
175 mulq vl0, ul0, m0a
176 umulh vl0, ul0, m0b
177 bge n, L(ge2)
C Exactly 2 limbs: join the wind-down at L(ta2).
179 mulq vl0, ul1, m1a
180 umulh vl0, ul1, m1b
181 srl m0a,NAIL_BITS, t0
182 addq t0, r31, acc0
183 srl m1a,NAIL_BITS, t0
184 addq t0, m0b, acc1
185 srl acc0,NUMB_BITS, t1
186 br r31, L(ta2)
C 6 or more limbs: load four more limbs, move rp forward by 32 (net bias
C +16), then either enter the loop at L(el2) or, for exactly 6 limbs, go
C straight to the wind-down at L(ta6).
188 L(ge2): ldq ul2, 0(up)
189 mulq vl0, ul1, m1a
190 umulh vl0, ul1, m1b
191 ldq ul3, 8(up)
192 lda n, -4(n)
193 mulq vl0, ul2, m2a
194 umulh vl0, ul2, m2b
195 srl m0a,NAIL_BITS, t0
196 ldq ul0, 16(up)
197 mulq vl0, ul3, m3a
198 addq t0, r31, acc0
199 umulh vl0, ul3, m3b
200 srl m1a,NAIL_BITS, t0
201 ldq ul1, 24(up)
202 lda up, 32(up)
203 lda rp, 32(rp)
204 mulq vl0, ul0, m0a
205 addq t0, m0b, acc1
206 srl acc0,NUMB_BITS, t1
207 bge n, L(el2)
209 br r31, L(ta6)
C Feed-in for n AND 3 = 1.  rp is biased by -24 so that a store at 24(rp)
C hits result limb 0.
211 L(1m4): lda n, -4(n)
212 ldq ul1, 0(up)
213 lda up, 8(up)
214 lda rp, -24(rp)
215 bge n, L(ge1)
C Exactly 1 limb: handled completely here, without the shared wind-down.
C r0 = carry out of the single accumulator + high product half.
217 mulq vl0, ul1, m1a
218 umulh vl0, ul1, m1b
219 srl m1a,NAIL_BITS, t0
220 addq t0, r31, acc1
221 and acc1,numb_mask, r28
222 srl acc1,NUMB_BITS, t1
223 stq r28, 24(rp)
224 addq t1, m1b, r0
225 ret r31, (r26), 1
C 5 or more limbs: load four more limbs, move rp forward by 32 (net bias
C +8).  m1a and m1b are reused for the fifth limb; the old m1a is consumed by
C the srl before the second mulq into m1a, and the old m1b is consumed by
C the addq into acc0 before L(ta5) or L(el1) issues the next umulh into m1b.
C Exactly 5 limbs branches to the wind-down at L(ta5).
227 L(ge1): ldq ul2, 0(up)
228 mulq vl0, ul1, m1a
229 umulh vl0, ul1, m1b
230 ldq ul3, 8(up)
231 lda n, -4(n)
232 mulq vl0, ul2, m2a
233 umulh vl0, ul2, m2b
234 ldq ul0, 16(up)
235 mulq vl0, ul3, m3a
236 umulh vl0, ul3, m3b
237 srl m1a,NAIL_BITS, t0
238 ldq ul1, 24(up)
239 lda up, 32(up)
240 lda rp, 32(rp)
241 mulq vl0, ul0, m0a
242 addq t0, r31, acc1
243 umulh vl0, ul0, m0b
244 srl m2a,NAIL_BITS, t0
245 mulq vl0, ul1, m1a
246 addq t0, m1b, acc0
247 srl acc1,NUMB_BITS, t1
248 blt n, L(ta5)
C 9 or more limbs.  L(ge5) is reached by fall-through only; no branch in this
C function targets it.  It performs the load the loop would have done just
C before L(el1) and then enters the loop there.
250 L(ge5): ldq ul2, 0(up)
251 br r31, L(el1)
C Main loop: four limbs per pass (4 ldq, 4 mulq, 4 umulh, 4 stq), with up and
C rp advanced by 32 bytes and n decremented by 4.  The code is software
C pipelined: each stq writes a limb whose products were started earlier, and
C ul0/ul2 and ul1/ul3 share registers, so instruction order must not change.
C L(el0)..L(el3) are the mid-loop entry points used by the feed-in code.
C The 13 groups of four instructions per pass correspond to the 3.25
C cycles/limb quoted for EV6 in the file header, assuming one group issues
C per cycle.  The trailing U0/U1/L0/L1 tags are presumably the intended EV6
C issue slots, and the unop instructions are padding that keeps each group
C at four instructions.
253 ALIGN(16)
254 L(top): mulq vl0, ul0, m0a C U1
255 addq t0, m0b, acc1 C L0
256 srl acc0,NUMB_BITS, t1 C U0
257 stq r28, -24(rp) C L1
259 L(el2): umulh vl0, ul0, m0b C U1
260 and acc0,numb_mask, r28 C L0
261 unop C U0
262 unop C L1
264 unop C U1
265 addq t1, acc1, acc1 C L0
266 srl m2a,NAIL_BITS, t0 C U0
267 ldq ul2, 0(up) C L1
269 mulq vl0, ul1, m1a C U1
270 addq t0, m1b, acc0 C L0
271 srl acc1,NUMB_BITS, t1 C U0
272 stq r28, -16(rp) C L1
274 L(el1): umulh vl0, ul1, m1b C U1
275 and acc1,numb_mask, r28 C L0
276 unop C U0
277 lda n, -4(n) C L1
279 unop C U1
280 addq t1, acc0, acc0 C L0
281 srl m3a,NAIL_BITS, t0 C U0
282 ldq ul3, 8(up) C L1
284 mulq vl0, ul2, m2a C U1
285 addq t0, m2b, acc1 C L0
286 srl acc0,NUMB_BITS, t1 C U0
287 stq r28, -8(rp) C L1
289 L(el0): umulh vl0, ul2, m2b C U1
290 and acc0,numb_mask, r28 C L0
291 unop C U0
292 unop C L1
294 unop C U1
295 addq t1, acc1, acc1 C L0
296 srl m0a,NAIL_BITS, t0 C U0
297 ldq ul0, 16(up) C L1
299 mulq vl0, ul3, m3a C U1
300 addq t0, m3b, acc0 C L0
301 srl acc1,NUMB_BITS, t1 C U0
302 stq r28, 0(rp) C L1
304 L(el3): umulh vl0, ul3, m3b C U1
305 and acc1,numb_mask, r28 C L0
306 unop C U0
307 unop C L1
309 unop C U1
310 addq t1, acc0, acc0 C L0
311 srl m1a,NAIL_BITS, t0 C U0
312 ldq ul1, 24(up) C L1
314 lda up, 32(up) C L0
315 unop C U1
316 lda rp, 32(rp) C L1
317 bge n, L(top) C U0
C Wind-down: the same accumulate / mask / store sequence as the loop, but
C with no further loads.  The products of the last two loaded limbs are
C completed in L(end)..L(ta5).  L(taK) is entered with K result limbs still
C to be stored; L(end) falls in from the loop with 7 and is not the target
C of any branch in this function.
319 L(end): mulq vl0, ul0, m0a
320 addq t0, m0b, acc1
321 srl acc0,NUMB_BITS, t1
322 stq r28, -24(rp)
323 L(ta6): umulh vl0, ul0, m0b
324 and acc0,numb_mask, r28
325 addq t1, acc1, acc1
326 srl m2a,NAIL_BITS, t0
327 mulq vl0, ul1, m1a
328 addq t0, m1b, acc0
329 srl acc1,NUMB_BITS, t1
330 stq r28, -16(rp)
331 L(ta5): umulh vl0, ul1, m1b
332 and acc1,numb_mask, r28
333 addq t1, acc0, acc0
334 srl m3a,NAIL_BITS, t0
335 addq t0, m2b, acc1
336 srl acc0,NUMB_BITS, t1
337 stq r28, -8(rp)
338 ALIGN(16)
339 L(ta4): and acc0,numb_mask, r28
340 addq t1, acc1, acc1
341 srl m0a,NAIL_BITS, t0
342 addq t0, m3b, acc0
343 srl acc1,NUMB_BITS, t1
344 stq r28, 0(rp)
345 unop
346 ALIGN(16)
347 L(ta3): and acc1,numb_mask, r28
348 addq t1, acc0, acc0
349 srl m1a,NAIL_BITS, t0
350 addq t0, m0b, acc1
351 srl acc0,NUMB_BITS, t1
352 stq r28, 8(rp)
353 unop
354 ALIGN(16)
C Last two limbs.  The return value r0 is the carry out of the final
C accumulator plus m1b, the high product half of the last source limb.
C Writing r0 here is safe although r0 is also m0a: m0a is not read again.
355 L(ta2): and acc0,numb_mask, r28
356 addq t1, acc1, acc1
357 srl acc1,NUMB_BITS, t1
358 stq r28, 16(rp)
359 and acc1,numb_mask, r28
360 addq t1, m1b, r0
361 stq r28, 24(rp)
362 ret r31, (r26), 1
363 EPILOGUE()
364 ASM_END()