1 dnl Alpha ev6 nails mpn_mul_1.
3 dnl Copyright 2002, 2005, 2006 Free Software Foundation, Inc.
5 dnl This file is part of the GNU MP Library.
7 dnl The GNU MP Library is free software; you can redistribute it and/or modify
8 dnl it under the terms of either:
10 dnl * the GNU Lesser General Public License as published by the Free
11 dnl Software Foundation; either version 3 of the License, or (at your
12 dnl option) any later version.
16 dnl * the GNU General Public License as published by the Free Software
17 dnl Foundation; either version 2 of the License, or (at your option) any
20 dnl or both in parallel, as here.
22 dnl The GNU MP Library is distributed in the hope that it will be useful, but
23 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
27 dnl You should have received copies of the GNU General Public License and the
28 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
29 dnl see https://www.gnu.org/licenses/.
dnl Include config.m4 from the parent directory.  The quoted path must sit on
dnl a single line: a newline inside the m4 quotes becomes part of the file name.
31 include(`../config.m4')
39 C * Reroll loop for 3.0 c/l with current 4-way unrolling.
40 C * The loop is overscheduled wrt loads and wrt multiplies, in particular
42 C * Use FP loop count and multiple exit points, that would simplify feed-in lp0
43 C and would work since the loop structure is really regular.
dnl Register alias: numb_mask expands to r6.  The expansion text must not
dnl contain a newline, otherwise every instruction using the alias is split
dnl across two lines after m4 expansion.
51 define(`numb_mask',`r6')
dnl Short local names for GMP_NAIL_BITS / GMP_NUMB_BITS (both are defined
dnl outside this excerpt).
78 define(`NAIL_BITS',`GMP_NAIL_BITS')
79 define(`NUMB_BITS',`GMP_NUMB_BITS')
81 dnl This declaration is munged by configure
C Nail setup.  vl0 is the factor used by every mulq/umulh in this file; it is
C shifted left by NAIL_BITS once, up front.
C NOTE(review): presumably this makes umulh return the product bits above
C NUMB_BITS directly (assuming NAIL_BITS + NUMB_BITS = 64) -- confirm.
86 sll vl0, NAIL_BITS, vl0
C numb_mask = r31 + (-1); r31 is the Alpha always-zero register, so all ones.
87 lda numb_mask, -1(r31)
C Clear the top NAIL_BITS bits, leaving a mask of the low bits only.
88 srl numb_mask, NAIL_BITS, numb_mask
C Entry paths L(3m4), L(ge3), L(ge4), L(ge2), L(ge1) and L(ge5).  Only some
C instructions of each path are present in this excerpt, so the lines between
C two labels do not necessarily all belong to the label above them.
C NOTE(review): the label names suggest dispatch on the limb count (n mod 4 and
C "n >= k" tests) feeding the pipelined loop -- confirm against the full file.
C
C Visible pattern on every path:
C   ldq ulN, 0(up)          load a source limb from the address in up
C   srl mNa,NAIL_BITS, t0   t0 = low product half shifted right by NAIL_BITS
C   srl accN,NUMB_BITS, t1  t1 = bits of the accumulator above NUMB_BITS
C   and accN,numb_mask, r28 r28 = accumulator with the high (nail) bits cleared
97 L(3m4): ldq ul3, 0(up)
111 srl m3a,NAIL_BITS, t0
113 srl m0a,NAIL_BITS, t0
115 srl acc1,NUMB_BITS, t1
118 L(ge3): ldq ul2, 0(up)
121 srl m3a,NAIL_BITS, t0
127 srl m0a,NAIL_BITS, t0
131 srl acc1,NUMB_BITS, t1
148 srl m2a,NAIL_BITS, t0
152 srl m3a,NAIL_BITS, t0
154 srl acc0,NUMB_BITS, t1
157 L(ge4): srl m2a,NAIL_BITS, t0
162 srl m3a,NAIL_BITS, t0
167 srl acc0,NUMB_BITS, t1
181 srl m0a,NAIL_BITS, t0
183 srl m1a,NAIL_BITS, t0
185 srl acc0,NUMB_BITS, t1
188 L(ge2): ldq ul2, 0(up)
195 srl m0a,NAIL_BITS, t0
200 srl m1a,NAIL_BITS, t0
206 srl acc0,NUMB_BITS, t1
219 srl m1a,NAIL_BITS, t0
221 and acc1,numb_mask, r28
222 srl acc1,NUMB_BITS, t1
227 L(ge1): ldq ul2, 0(up)
237 srl m1a,NAIL_BITS, t0
244 srl m2a,NAIL_BITS, t0
247 srl acc1,NUMB_BITS, t1
250 L(ge5): ldq ul2, 0(up)
C Main loop body, covering four limbs ul0..ul3 per pass (only part of each pass
C is present in this excerpt; loads, pointer updates and the loop branch are not
C visible).  L(el2), L(el1), L(el0) and L(el3) are labels inside the body.
C NOTE(review): presumably they are the entry points used by the feed-in
C code -- confirm.
C
C Visible per-limb pattern (N = current limb, P = previous limb):
C   mulq  vl0, ulN, mNa     mNa = low 64 bits of vl0*ulN
C   umulh vl0, ulN, mNb     mNb = high 64 bits of vl0*ulN
C   srl   mNa,NAIL_BITS,t0  t0  = low half shifted right by NAIL_BITS
C   addq  t0, mPb, acc      acc = t0 + high half of the previous limb's product
C   addq  t1, acc, acc      add t1, the bits above NUMB_BITS of the other acc
C   and   acc,numb_mask,r28 r28 = acc with the nail bits cleared
C   srl   acc,NUMB_BITS,t1  t1  = bits of acc above NUMB_BITS, for the next limb
C   stq   r28, off(rp)      store the masked limb at rp+off
C acc0 and acc1 alternate from limb to limb.
C NOTE(review): the trailing U0/U1/L0/L1 tags look like EV6 issue-slot
C annotations -- confirm.
254 L(top): mulq vl0, ul0, m0a C U1
255 addq t0, m0b, acc1 C L0
256 srl acc0,NUMB_BITS, t1 C U0
257 stq r28, -24(rp) C L1
259 L(el2): umulh vl0, ul0, m0b C U1
260 and acc0,numb_mask, r28 C L0
265 addq t1, acc1, acc1 C L0
266 srl m2a,NAIL_BITS, t0 C U0
269 mulq vl0, ul1, m1a C U1
270 addq t0, m1b, acc0 C L0
271 srl acc1,NUMB_BITS, t1 C U0
272 stq r28, -16(rp) C L1
274 L(el1): umulh vl0, ul1, m1b C U1
275 and acc1,numb_mask, r28 C L0
280 addq t1, acc0, acc0 C L0
281 srl m3a,NAIL_BITS, t0 C U0
284 mulq vl0, ul2, m2a C U1
285 addq t0, m2b, acc1 C L0
286 srl acc0,NUMB_BITS, t1 C U0
289 L(el0): umulh vl0, ul2, m2b C U1
290 and acc0,numb_mask, r28 C L0
295 addq t1, acc1, acc1 C L0
296 srl m0a,NAIL_BITS, t0 C U0
299 mulq vl0, ul3, m3a C U1
300 addq t0, m3b, acc0 C L0
301 srl acc1,NUMB_BITS, t1 C U0
304 L(el3): umulh vl0, ul3, m3b C U1
305 and acc1,numb_mask, r28 C L0
310 addq t1, acc0, acc0 C L0
311 srl m1a,NAIL_BITS, t0 C U0
C Code from L(end) onward, with labels L(ta6) down to L(ta2).  It repeats the
C shift / mask / carry steps of the main loop; the visible multiplies stop after
C ul1 (mulq on ul0, umulh on ul0 and ul1).
C NOTE(review): presumably this is the wind-down that drains the software
C pipeline, with L(taN) as tail entry points for short operands -- confirm.
C Stores, adds and the return are not visible in this excerpt.
C
C   srl mNa,NAIL_BITS, t0   t0  = low product half shifted right by NAIL_BITS
C   and acc,numb_mask, r28  r28 = accumulator with the nail bits cleared
C   srl acc,NUMB_BITS, t1   t1  = bits of the accumulator above NUMB_BITS
319 L(end): mulq vl0, ul0, m0a
321 srl acc0,NUMB_BITS, t1
323 L(ta6): umulh vl0, ul0, m0b
324 and acc0,numb_mask, r28
326 srl m2a,NAIL_BITS, t0
329 srl acc1,NUMB_BITS, t1
331 L(ta5): umulh vl0, ul1, m1b
332 and acc1,numb_mask, r28
334 srl m3a,NAIL_BITS, t0
336 srl acc0,NUMB_BITS, t1
339 L(ta4): and acc0,numb_mask, r28
341 srl m0a,NAIL_BITS, t0
343 srl acc1,NUMB_BITS, t1
347 L(ta3): and acc1,numb_mask, r28
349 srl m1a,NAIL_BITS, t0
351 srl acc0,NUMB_BITS, t1
355 L(ta2): and acc0,numb_mask, r28
357 srl acc1,NUMB_BITS, t1
359 and acc1,numb_mask, r28