1 dnl Alpha ev6 mpn_addmul_1
and mpn_submul_1.
3 dnl Copyright
2000, 2003-2005, 2008 Free Software Foundation
, Inc.
5 dnl
This file is part of the GNU MP Library.
7 dnl The GNU MP Library is free software
; you can redistribute it and/or modify
8 dnl it under the terms of
either:
10 dnl
* the GNU Lesser General
Public License as published by the Free
11 dnl Software Foundation
; either version 3 of the License, or (at your
12 dnl option
) any later version.
16 dnl
* the GNU General
Public License as published by the Free Software
17 dnl Foundation
; either version 2 of the License, or (at your option) any
20 dnl
or both
in parallel
, as here.
22 dnl The GNU MP Library is distributed
in the hope that it will be useful
, but
23 dnl WITHOUT ANY WARRANTY
; without even the implied warranty of MERCHANTABILITY
24 dnl
or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
Public License
27 dnl You should have received copies of the GNU General
Public License
and the
28 dnl GNU Lesser General
Public License along with the GNU MP Library. If
not,
29 dnl see https://www.gnu.org/licenses/.
31 include(`..
/config.m4
')
44 dnl This code was written in cooperation with ev6 pipeline expert Steve Root.
46 dnl The stores can issue a cycle late so we have paired no-op's to
'catch'
47 dnl them
, so that further disturbance to the schedule is damped.
49 dnl We couldn
't pair the loads, because the entangled schedule of the carries
50 dnl has to happen on one side
{0} of the machine.
52 dnl
This is a great schedule for the d_cache
, a poor schedule for the b_cache.
53 dnl The lockup on U0 means that any stall can
't be recovered from. Consider a
54 dnl ldq in L1, say that load gets stalled because it collides with a fill from
55 dnl the b_cache. On the next cycle, this load gets priority. It first looks
56 dnl at L0, and goes there. The instruction we intended for L0 gets to look at
57 dnl L1, which is NOT where we want it. It either stalls 1, because it can't
58 dnl go
in L0
, or goes there
, and causes a further instruction to stall.
60 dnl So for b_cache
, we
're likely going to want to put one or more cycles back
61 dnl into the code! And, of course, put in lds prefetch for the rp[] operand.
62 dnl At a place where we have an mt followed by a bookkeeping, put the
63 dnl bookkeeping in upper, and the prefetch into lower.
65 dnl Note, the ldq's
and stq
's are at the end of the quadpacks. Note, we'd
66 dnl like
not to have an ldq
or an stq to precede a conditional branch
in a
67 dnl quadpack. The conditional branch moves the retire pointer one cycle
dnl  Addition variant, selected when OPERATION_addmul_1 is defined:
dnl    ADDSUB      becomes the addq instruction.
dnl    CMPCY(a,b)  becomes cmpult b,a (operands swapped).  It is used right
dnl                after an ADDSUB with a = one addend and b = the sum, so the
dnl                unsigned less-than test yields 1 when the sum wrapped
dnl                below the addend, i.e. the carry out of that addition.
dnl    func        is the entry point name, mpn_addmul_1.
70 ifdef
(`OPERATION_addmul_1
',`
71 define(`ADDSUB', `addq
')
72 define(`CMPCY', `cmpult
$2,$1')
73 define(`func', `mpn_addmul_1
')
dnl  Subtraction variant, selected when OPERATION_submul_1 is defined:
dnl    ADDSUB      becomes the subq instruction.
dnl    CMPCY(a,b)  becomes cmpult a,b (operands in the order given).  It is
dnl                used right after an ADDSUB with a = the minuend and b = the
dnl                difference, so the unsigned less-than test yields 1 when
dnl                the difference wrapped above the minuend, i.e. the borrow
dnl                out of that subtraction.
dnl    func        is the entry point name, mpn_submul_1.
75 ifdef
(`OPERATION_submul_1
',`
76 define(`ADDSUB', `subq
')
77 define(`CMPCY', `cmpult
$1,$2')
78 define(`func', `mpn_submul_1
')
dnl  Lists the two entry point names this source can provide, matching the two
dnl  OPERATION_ variants handled by the ifdef blocks in this file.
dnl  NOTE(review): MULFUNC_PROLOGUE is defined outside this file; how it uses
dnl  the list is not visible here -- confirm against its definition.
81 MULFUNC_PROLOGUE
(mpn_addmul_1 mpn_submul_1
)
91 $1mod8
: ldq r5
, 0(rp
) C
95 CMPCY
( r5
, r23
), r20 C
101 $
L1: lda r8
, 0(r31
) C zero carry reg
102 lda r24
, 0(r31
) C zero carry reg
116 $7mod8
: ldq r5
, 0(rp
) C
121 CMPCY
( r5
, r23
), r20 C
126 $6mod8
: ldq r1
, 8(up
) C
134 lda up
, 48(up
) C L1 bookkeeping
137 lda rp
, -32(rp
) C L1 bookkeeping
139 ADDSUB r4
, r25
, r25 C lo
+ acc
143 $ent
1: lda up
, 8(up
) C
147 $0mod8
: ldq r1
, 8(up
) C
158 ADDSUB r4
, r2
, r2 C lo
+ acc
163 $3mod8
: ldq r5
, 0(rp
) C
168 CMPCY
( r5
, r23
), r20 C
173 $2mod8
: ldq r1
, 8(up
) C
182 lda up
, 16(up
) C L1 bookkeeping
185 lda rp
, 0(rp
) C L1 bookkeeping
187 ADDSUB r4
, r25
, r25 C lo
+ acc
191 $5mod8
: ldq r5
, 0(rp
) C
196 CMPCY
( r5
, r23
), r20 C
201 $4mod8
: ldq r1
, 8(up
) C
209 lda up
, 32(up
) C L1 bookkeeping
212 lda rp
, 16(rp
) C L1 bookkeeping
214 ADDSUB r4
, r2
, r2 C lo
+ acc
216 CMPCY
( r4
, r2
), r20 C L0 lo
add => carry
217 ADDSUB r2
, r8
, r22 C U0 hi
add => answer
221 bis r31
, r31
, r31 C U1 mt
222 CMPCY
( r2
, r22
), r21 C L0 hi
add => carry
223 addq r6
, r20
, r6 C U0 hi
mul + carry
226 bis r31
, r31
, r31 C U1 mt
227 ADDSUB r5
, r7
, r7 C L0 lo
+ acc
228 addq r6
, r21
, r6 C U0 hi
mul + carry
231 umulh v0
, r1
, r8 C U1
232 CMPCY
( r5
, r7
), r20 C L0 lo
add => carry
233 ADDSUB r7
, r6
, r23 C U0 hi
add => answer
237 CMPCY
( r7
, r23
), r21 C L0 hi
add => carry
238 addq r24
, r20
, r24 C U0 hi
mul + carry
241 umulh v0
, r0
, r6 C U1
242 ADDSUB r4
, r25
, r25 C U0 lo
+ acc
243 stq r22
, -16(rp
) C L0
246 bis r31
, r31
, r31 C L0
st slosh
248 bis r31
, r31
, r31 C L1
st slosh
249 addq r24
, r21
, r24 C U0 hi
mul + carry
251 CMPCY
( r4
, r25
), r20 C L0 lo
add => carry
252 bis r31
, r31
, r31 C U1 mt
253 lda r18
, -8(r18
) C L1 bookkeeping
254 ADDSUB r25
, r24
, r22 C U0 hi
add => answer
256 bis r31
, r31
, r31 C U1 mt
257 CMPCY
( r25
, r22
), r21 C L0 hi
add => carry
258 addq r3
, r20
, r3 C U0 hi
mul + carry
261 bis r31
, r31
, r31 C U1 mt
262 ADDSUB r5
, r28
, r28 C L0 lo
+ acc
263 addq r3
, r21
, r3 C U0 hi
mul + carry
266 umulh v0
, r1
, r24 C U1
267 CMPCY
( r5
, r28
), r20 C L0 lo
add => carry
268 ADDSUB r28
, r3
, r23 C U0 hi
add => answer
271 mulq v0
, r0
, r25 C U1
272 CMPCY
( r28
, r23
), r21 C L0 hi
add => carry
273 addq r8
, r20
, r8 C U0 hi
mul + carry
276 umulh v0
, r0
, r3 C U1
277 ADDSUB r4
, r2
, r2 C U0 lo
+ acc
281 bis r31
, r31
, r31 C L0
st slosh
282 mulq v0
, r1
, r28 C U1
283 bis r31
, r31
, r31 C L1
st slosh
284 addq r8
, r21
, r8 C U0 hi
mul + carry
286 CMPCY
( r4
, r2
), r20 C L0 lo
add => carry
287 bis r31
, r31
, r31 C U1 mt
288 lda up
, 64(up
) C L1 bookkeeping
289 ADDSUB r2
, r8
, r22 C U0 hi
add => answer
291 bis r31
, r31
, r31 C U1 mt
292 CMPCY
( r2
, r22
), r21 C L0 hi
add => carry
293 addq r6
, r20
, r6 C U0 hi
mul + carry
296 bis r31
, r31
, r31 C U1 mt
297 ADDSUB r5
, r7
, r7 C L0 lo
+ acc
298 addq r6
, r21
, r6 C U0 hi
mul + carry
301 umulh v0
, r1
, r8 C U1
302 CMPCY
( r5
, r7
), r20 C L0 lo
add => carry
303 ADDSUB r7
, r6
, r23 C U0 hi
add => answer
307 CMPCY
( r7
, r23
), r21 C L0 hi
add => carry
308 addq r24
, r20
, r24 C U0 hi
mul + carry
311 umulh v0
, r0
, r6 C U1
312 ADDSUB r4
, r25
, r25 C U0 lo
+ acc
316 bis r31
, r31
, r31 C L0
st slosh
318 bis r31
, r31
, r31 C L1
st slosh
319 addq r24
, r21
, r24 C U0 hi
mul + carry
321 CMPCY
( r4
, r25
), r20 C L0 lo
add => carry
322 bis r31
, r31
, r31 C U1 mt
323 lda rp
, 64(rp
) C L1 bookkeeping
324 ADDSUB r25
, r24
, r22 C U0 hi
add => answer
326 bis r31
, r31
, r31 C U1 mt
327 CMPCY
( r25
, r22
), r21 C L0 hi
add => carry
328 addq r3
, r20
, r3 C U0 hi
mul + carry
331 bis r31
, r31
, r31 C U1 mt
332 ADDSUB r5
, r28
, r28 C L0 lo
+ acc
333 addq r3
, r21
, r3 C U0 hi
mul + carry
336 umulh v0
, r1
, r24 C U1
337 CMPCY
( r5
, r28
), r20 C L0 lo
add => carry
338 ADDSUB r28
, r3
, r23 C U0 hi
add => answer
341 mulq v0
, r0
, r25 C U1
342 CMPCY
( r28
, r23
), r21 C L0 hi
add => carry
343 addq r8
, r20
, r8 C U0 hi
mul + carry
346 umulh v0
, r0
, r3 C U1
347 ADDSUB r4
, r2
, r2 C U0 lo
+ acc
348 stq r22
, -32(rp
) C L0
349 stq r23
, -24(rp
) C L1
351 bis r31
, r31
, r31 C L0
st slosh
352 mulq v0
, r1
, r28 C U1
353 bis r31
, r31
, r31 C L1
st slosh
354 addq r8
, r21
, r8 C U0 hi
mul + carry
356 CMPCY
( r4
, r2
), r20 C L0 lo
add => carry
357 ADDSUB r2
, r8
, r22 C U0 hi
add => answer
358 ldl r31
, 256(up
) C prefetch up
[]
359 bgt r18
, $
Loop C U1 bookkeeping
361 $
Lend: CMPCY
( r2
, r22
), r21 C
367 CMPCY
( r5
, r7
), r20 C
369 CMPCY
(r7
, r23
), r21 C
372 ADDSUB r4
, r25
, r25 C
379 $
n23: ldq r4
, 0(rp
) C
382 ADDSUB r4
, r25
, r25 C
383 L
(x
): CMPCY
( r4
, r25
), r20 C
384 ADDSUB r25
, r24
, r22 C
385 CMPCY
( r25
, r22
), r21 C
387 ADDSUB r5
, r28
, r28 C
389 CMPCY
( r5
, r28
), r20 C
390 ADDSUB r28
, r3
, r23 C
391 CMPCY
( r28
, r23
), r21 C