1 dnl Alpha ev6 nails mpn_submul_1.
3 dnl Copyright
2002, 2005, 2006 Free Software Foundation
, Inc.
5 dnl
This file is part of the GNU MP Library.
7 dnl The GNU MP Library is free software
; you can redistribute it and/or modify
8 dnl it under the terms of
either:
10 dnl
* the GNU Lesser General
Public License as published by the Free
11 dnl Software Foundation
; either version 3 of the License, or (at your
12 dnl option
) any later version.
16 dnl
* the GNU General
Public License as published by the Free Software
17 dnl Foundation
; either version 2 of the License, or (at your option) any
20 dnl
or both
in parallel
, as here.
22 dnl The GNU MP Library is distributed
in the hope that it will be useful
, but
23 dnl WITHOUT ANY WARRANTY
; without even the implied warranty of MERCHANTABILITY
24 dnl
or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
Public License
27 dnl You should have received copies of the GNU General
Public License
and the
28 dnl GNU Lesser General
Public License along with the GNU MP Library. If
not,
29 dnl see
https://www.gnu.
org/licenses
/.
31 include(`..
/config.m4
')
39 C * Reroll loop for 3.75 c/l with current 4-way unrolling.
40 C * The loop is overscheduled wrt loads and wrt multiplies, in particular
42 C * Use an FP loop count and multiple exit points; that would simplify feed-in lp0
43 C and would work since the loop structure is really regular.
dnl numb_mask is an m4 alias for register r6.  The entry code of mpn_submul_1
dnl sets it with lda -1(r31) followed by srl by NAIL_BITS, and the limb
dnl computations use it as the mask operand of their and instructions.
dnl NOTE review: the closing quote of each definition below sits on the
dnl following line, so as written each expansion text ends with a newline.
dnl This looks like text damage; confirm against the upstream file.
51 define(`numb_mask',`r6
')
dnl NAIL_BITS and NUMB_BITS are local aliases for GMP_NAIL_BITS and
dnl GMP_NUMB_BITS.  In the code that follows, NAIL_BITS is used as the shift
dnl count of sll and srl instructions and NUMB_BITS as the shift count of sra
dnl instructions.
78 define(`NAIL_BITS',`GMP_NAIL_BITS
')
79 define(`NUMB_BITS',`GMP_NUMB_BITS
')
81 dnl This declaration is munged by configure
85 PROLOGUE(mpn_submul_1)
86 sll vl0, NAIL_BITS, vl0
87 lda numb_mask, -1(r31)
88 srl numb_mask, NAIL_BITS, numb_mask
97 L(3m4): ldq ul3, 0(up)
112 srl m3a,NAIL_BITS, t0
116 srl m0a,NAIL_BITS, t0
118 sra acc1,NUMB_BITS, t1
121 L(ge3): ldq ul2, 0(up)
125 srl m3a,NAIL_BITS, t0
133 srl m0a,NAIL_BITS, t0
137 sra acc1,NUMB_BITS, t1
155 srl m2a,NAIL_BITS, t0
161 srl m3a,NAIL_BITS, t0
163 sra acc0,NUMB_BITS, t1
166 L(ge4): ldq rl2, 0(rp)
167 srl m2a,NAIL_BITS, t0
174 srl m3a,NAIL_BITS, t0
179 sra acc0,NUMB_BITS, t1
194 srl m0a,NAIL_BITS, t0
198 srl m1a,NAIL_BITS, t0
200 sra acc0,NUMB_BITS, t1
203 L(ge2): ldq ul2, 0(up)
211 srl m0a,NAIL_BITS, t0
218 srl m1a,NAIL_BITS, t0
224 sra acc0,NUMB_BITS, t1
238 srl m1a,NAIL_BITS, t0
240 and acc1,numb_mask, r28
241 sra acc1,NUMB_BITS, t1
246 L(ge1): ldq ul2, 0(up)
257 srl m1a,NAIL_BITS, t0
266 srl m2a,NAIL_BITS, t0
269 sra acc1,NUMB_BITS, t1
272 L(ge5): ldq ul2, 0(up)
276 L(top): mulq vl0, ul0, m0a C U1
277 addq t0, m0b, acc1 C L0
278 sra acc0,NUMB_BITS, t1 C U0
279 stq r28, -24(rp) C L1
281 L(el2): umulh vl0, ul0, m0b C U1
282 and acc0,numb_mask, r28 C L0
283 subq rl1, acc1, acc1 C U0
287 addq t1, acc1, acc1 C L0
288 srl m2a,NAIL_BITS, t0 C U0
291 mulq vl0, ul1, m1a C U1
292 addq t0, m1b, acc0 C L0
293 sra acc1,NUMB_BITS, t1 C U0
294 stq r28, -16(rp) C L1
296 L(el1): umulh vl0, ul1, m1b C U1
297 and acc1,numb_mask, r28 C L0
298 subq rl2, acc0, acc0 C U0
302 addq t1, acc0, acc0 C L0
303 srl m3a,NAIL_BITS, t0 C U0
306 mulq vl0, ul2, m2a C U1
307 addq t0, m2b, acc1 C L0
308 sra acc0,NUMB_BITS, t1 C U0
311 L(el0): umulh vl0, ul2, m2b C U1
312 and acc0,numb_mask, r28 C L0
313 subq rl3, acc1, acc1 C U0
317 addq t1, acc1, acc1 C L0
318 srl m0a,NAIL_BITS, t0 C U0
321 mulq vl0, ul3, m3a C U1
322 addq t0, m3b, acc0 C L0
323 sra acc1,NUMB_BITS, t1 C U0
326 L(el3): umulh vl0, ul3, m3b C U1
327 and acc1,numb_mask, r28 C L0
328 subq rl0, acc0, acc0 C U0
332 addq t1, acc0, acc0 C L0
333 srl m1a,NAIL_BITS, t0 C U0
341 L(end): mulq vl0, ul0, m0a
343 sra acc0,NUMB_BITS, t1
345 L(ta6): umulh vl0, ul0, m0b
346 and acc0,numb_mask, r28
350 srl m2a,NAIL_BITS, t0
353 sra acc1,NUMB_BITS, t1
355 L(ta5): umulh vl0, ul1, m1b
356 and acc1,numb_mask, r28
360 srl m3a,NAIL_BITS, t0
362 sra acc0,NUMB_BITS, t1
366 L(ta4): and acc0,numb_mask, r28
370 srl m0a,NAIL_BITS, t0
372 sra acc1,NUMB_BITS, t1
376 L(ta3): and acc1,numb_mask, r28
380 srl m1a,NAIL_BITS, t0
382 sra acc0,NUMB_BITS, t1
386 L(ta2): and acc0,numb_mask, r28
389 sra acc1,NUMB_BITS, t1
391 and acc1,numb_mask, r28