beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / alpha / ev6 / aorsmul_1.asm
blob0e68e6e7adb4ca296449836fe5cc94b9bc59d0f8
1 dnl Alpha ev6 mpn_addmul_1 and mpn_submul_1.
3 dnl Copyright 2000, 2003-2005, 2008 Free Software Foundation, Inc.
5 dnl This file is part of the GNU MP Library.
6 dnl
7 dnl The GNU MP Library is free software; you can redistribute it and/or modify
8 dnl it under the terms of either:
9 dnl
10 dnl * the GNU Lesser General Public License as published by the Free
11 dnl Software Foundation; either version 3 of the License, or (at your
12 dnl option) any later version.
13 dnl
14 dnl or
15 dnl
16 dnl * the GNU General Public License as published by the Free Software
17 dnl Foundation; either version 2 of the License, or (at your option) any
18 dnl later version.
19 dnl
20 dnl or both in parallel, as here.
21 dnl
22 dnl The GNU MP Library is distributed in the hope that it will be useful, but
23 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
25 dnl for more details.
26 dnl
27 dnl You should have received copies of the GNU General Public License and the
28 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
29 dnl see https://www.gnu.org/licenses/.
31 include(`../config.m4')
33 C cycles/limb
34 C EV4: 42
35 C EV5: 18
36 C EV6: 3.5
38 C INPUT PARAMETERS
C Symbolic names for the four argument registers used by the routine below.
C rp is the limb pointer that is both loaded from and stored to.
C up is the limb pointer that is only loaded from.
C n is the limb count; the routine body refers to it directly as r18.
C v0 is the single multiplier limb applied to every limb read through up.
39 define(`rp', `r16')
40 define(`up', `r17')
41 define(`n', `r18')
42 define(`v0', `r19')
44 dnl This code was written in cooperation with ev6 pipeline expert Steve Root.
46 dnl The stores can issue a cycle late so we have paired no-op's to 'catch'
47 dnl them, so that further disturbance to the schedule is damped.
49 dnl We couldn't pair the loads, because the entangled schedule of the carries
50 dnl has to happen on one side {0} of the machine.
52 dnl This is a great schedule for the d_cache, a poor schedule for the b_cache.
53 dnl The lockup on U0 means that any stall can't be recovered from. Consider a
54 dnl ldq in L1, say that load gets stalled because it collides with a fill from
55 dnl the b_cache. On the next cycle, this load gets priority. It first looks
56 dnl at L0, and goes there. The instruction we intended for L0 gets to look at
57 dnl L1, which is NOT where we want it. It either stalls 1, because it can't
58 dnl go in L0, or goes there, and causes a further instruction to stall.
60 dnl So for b_cache, we're likely going to want to put one or more cycles back
61 dnl into the code! And, of course, put in lds prefetch for the rp[] operand.
62 dnl At a place where we have an mt followed by a bookkeeping, put the
63 dnl bookkeeping in upper, and the prefetch into lower.
65 dnl Note, the ldq's and stq's are at the end of the quadpacks. Note, we'd
66 dnl like not to have an ldq or an stq to precede a conditional branch in a
67 dnl quadpack. The conditional branch moves the retire pointer one cycle
68 dnl later.
dnl Build-time selection of the operation. One of OPERATION_addmul_1 or
dnl OPERATION_submul_1 is presumably defined by the build before this point.
dnl   ADDSUB  the limb instruction used to combine values, addq or subq
dnl   CMPCY   written as CMPCY(old, new) after new = old ADDSUB x; expands to
dnl           an unsigned compare giving the carry out of the add (new below
dnl           old) or the borrow out of the subtract (old below new)
dnl   func    the name of the function being assembled
dnl Each ifdef body is a quoted argument and has to be closed again, otherwise
dnl m4 keeps collecting the rest of the file as part of that argument.
70 ifdef(`OPERATION_addmul_1',`
71 define(`ADDSUB', `addq')
72 define(`CMPCY', `cmpult $2,$1')
73 define(`func', `mpn_addmul_1')
74 ')
75 ifdef(`OPERATION_submul_1',`
76 define(`ADDSUB', `subq')
77 define(`CMPCY', `cmpult $1,$2')
78 define(`func', `mpn_submul_1')
79 ')
81 MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)
83 ASM_START()
C func: for each of the n limbs read through up, multiply by v0 and combine
C the low product word (plus the running carry) into the limb at rp using
C ADDSUB, storing the result back through rp.  The final carry limb is left
C in r0 just before the ret.
C
C Register use visible below:
C   r16 rp, r17 up, r19 v0   arguments (see INPUT PARAMETERS)
C   r18                      n on entry, immediately rebiased to n - 9 and
C                            stepped down by 8 once per loop pass
C   r20, r21                 0 or 1 carry bits produced by CMPCY
C   r8, r24                  carry limb handed between two-limb stages
C   r22, r23                 finished result limbs waiting to be stored
C
C The loop body handles 8 limbs per pass as four two-limb stages.  It can be
C entered at $Loop, $ent2, $ent0 or $ent6; the entry is picked from n mod 8.
C Odd residues first retire a single limb and then fall into the even case
C directly below them.  The U0/U1/L0/L1 tags in the line comments presumably
C name the issue slot each instruction is scheduled for, as discussed in the
C notes at the top of the file.
84 PROLOGUE(func)
C Fetch the first source limb, compute r20 = n mod 8 and r18 = n - 9, and
C branch away unless n mod 8 is 1.
85 ldq r3, 0(up) C
86 and r18, 7, r20 C
87 lda r18, -9(r18) C
88 cmpeq r20, 1, r21 C
89 beq r21, $L1 C
C n mod 8 = 1: retire one limb.  r0 = high product + carry.  When r18 >= 0
C (n >= 9) more limbs remain and we continue at $ent1; otherwise return with
C r0 as the carry.
91 $1mod8: ldq r5, 0(rp) C
92 mulq v0, r3, r7 C
93 umulh v0, r3, r8 C
94 ADDSUB r5, r7, r23 C
95 CMPCY( r5, r23), r20 C
96 addq r8, r20, r0 C
97 stq r23, 0(rp) C
98 bge r18, $ent1 C
99 ret r31, (r26), 1 C
C Clear both carry registers, then dispatch on n mod 8 with a compare chain.
C Residue 0 is what remains once 2 through 7 have been ruled out; residue 7
C falls through into the code directly below.
101 $L1: lda r8, 0(r31) C zero carry reg
102 lda r24, 0(r31) C zero carry reg
103 cmpeq r20, 2, r21 C
104 bne r21, $2mod8 C
105 cmpeq r20, 3, r21 C
106 bne r21, $3mod8 C
107 cmpeq r20, 4, r21 C
108 bne r21, $4mod8 C
109 cmpeq r20, 5, r21 C
110 bne r21, $5mod8 C
111 cmpeq r20, 6, r21 C
112 bne r21, $6mod8 C
113 cmpeq r20, 7, r21 C
114 beq r21, $0mod8 C
C n mod 8 = 7: retire one limb, leave its carry limb in r24, step up and rp
C by one limb, reload r3 and fall into the 6 mod 8 case.
116 $7mod8: ldq r5, 0(rp) C
117 lda up, 8(up) C
118 mulq v0, r3, r7 C
119 umulh v0, r3, r24 C
120 ADDSUB r5, r7, r23 C
121 CMPCY( r5, r23), r20 C
122 addq r24, r20, r24 C
123 stq r23, 0(rp) C
124 lda rp, 8(rp) C
125 ldq r3, 0(up) C
C n mod 8 = 6: pipeline warm-up for entry at $ent6.  Starts the products for
C the first four limbs, preloads two destination limbs, and biases up by +48
C and rp by -32 so the fixed offsets used after $ent6 address the right limbs.
126 $6mod8: ldq r1, 8(up) C
127 mulq v0, r3, r25 C
128 umulh v0, r3, r3 C
129 mulq v0, r1, r28 C
130 ldq r0, 16(up) C
131 ldq r4, 0(rp) C
132 umulh v0, r1, r8 C
133 ldq r1, 24(up) C
134 lda up, 48(up) C L1 bookkeeping
135 mulq v0, r0, r2 C
136 ldq r5, 8(rp) C
137 lda rp, -32(rp) C L1 bookkeeping
138 umulh v0, r0, r6 C
139 ADDSUB r4, r25, r25 C lo + acc
140 mulq v0, r1, r7 C
141 br r31, $ent6 C
C Continuation of the 1 mod 8 case when limbs remain: step past the limb
C already done, move its carry limb from r0 into r8, reload r3 and fall into
C the 0 mod 8 case.
143 $ent1: lda up, 8(up) C
144 lda rp, 8(rp) C
145 lda r8, 0(r0) C
146 ldq r3, 0(up) C
C n mod 8 = 0: pipeline warm-up for entry at $ent0.  up is left unbiased and
C rp is biased by -16.
147 $0mod8: ldq r1, 8(up) C
148 mulq v0, r3, r2 C
149 umulh v0, r3, r6 C
150 mulq v0, r1, r7 C
151 ldq r0, 16(up) C
152 ldq r4, 0(rp) C
153 umulh v0, r1, r24 C
154 ldq r1, 24(up) C
155 mulq v0, r0, r25 C
156 ldq r5, 8(rp) C
157 umulh v0, r0, r3 C
158 ADDSUB r4, r2, r2 C lo + acc
159 mulq v0, r1, r28 C
160 lda rp, -16(rp) C
161 br r31, $ent0 C
C n mod 8 = 3: retire one limb, leave its carry limb in r24 and fall into the
C 2 mod 8 case.
163 $3mod8: ldq r5, 0(rp) C
164 lda up, 8(up) C
165 mulq v0, r3, r7 C
166 umulh v0, r3, r8 C
167 ADDSUB r5, r7, r23 C
168 CMPCY( r5, r23), r20 C
169 addq r8, r20, r24 C
170 stq r23, 0(rp) C
171 lda rp, 8(rp) C
172 ldq r3, 0(up) C
C n mod 8 = 2: when r18 <= 0 only two limbs are left (n was 2 or 3), so go
C to the short tail at $n23 before any load beyond them is issued.  Otherwise
C warm up the pipeline for entry at $ent2, biasing up by +16 (the lda on rp
C adds 0 and only fills the slot).
173 $2mod8: ldq r1, 8(up) C
174 mulq v0, r3, r25 C
175 umulh v0, r3, r3 C
176 mulq v0, r1, r28 C
177 ble r18, $n23 C
178 ldq r0, 16(up) C
179 ldq r4, 0(rp) C
180 umulh v0, r1, r8 C
181 ldq r1, 24(up) C
182 lda up, 16(up) C L1 bookkeeping
183 mulq v0, r0, r2 C
184 ldq r5, 8(rp) C
185 lda rp, 0(rp) C L1 bookkeeping
186 umulh v0, r0, r6 C
187 ADDSUB r4, r25, r25 C lo + acc
188 mulq v0, r1, r7 C
189 br r31, $ent2 C
C n mod 8 = 5: retire one limb, leave its carry limb in r8 and fall into the
C 4 mod 8 case.
191 $5mod8: ldq r5, 0(rp) C
192 lda up, 8(up) C
193 mulq v0, r3, r7 C
194 umulh v0, r3, r24 C
195 ADDSUB r5, r7, r23 C
196 CMPCY( r5, r23), r20 C
197 addq r24, r20, r8 C
198 stq r23, 0(rp) C
199 lda rp, 8(rp) C
200 ldq r3, 0(up) C
C n mod 8 = 4: pipeline warm-up for entry at the loop top.  Biases up by +32
C and rp by +16, performs the first carry step, and goes straight to the
C drain code at $Lend when r18 <= 0 (n was 4 or 5).  Otherwise it falls into
C $Loop.
201 $4mod8: ldq r1, 8(up) C
202 mulq v0, r3, r2 C
203 umulh v0, r3, r6 C
204 mulq v0, r1, r7 C
205 ldq r0, 16(up) C
206 ldq r4, 0(rp) C
207 umulh v0, r1, r24 C
208 ldq r1, 24(up) C
209 lda up, 32(up) C L1 bookkeeping
210 mulq v0, r0, r25 C
211 ldq r5, 8(rp) C
212 lda rp, 16(rp) C L1 bookkeeping
213 umulh v0, r0, r3 C
214 ADDSUB r4, r2, r2 C lo + acc
215 mulq v0, r1, r28 C
216 CMPCY( r4, r2), r20 C L0 lo add => carry
217 ADDSUB r2, r8, r22 C U0 hi add => answer
218 ble r18, $Lend C
219 ALIGN(16)
C Main loop, 8 limbs per pass.  Each two-limb stage folds the carry bits into
C the high product, forms two result limbs in r22 and r23, stores them, and
C starts the loads and multiplies for limbs further ahead.  The
C bis r31, r31, r31 lines are the paired no-ops described in the notes at the
C top of the file.
C Stage 1 (loop top): stores to rp-16 and rp-8.
220 $Loop:
221 bis r31, r31, r31 C U1 mt
222 CMPCY( r2, r22), r21 C L0 hi add => carry
223 addq r6, r20, r6 C U0 hi mul + carry
224 ldq r0, 0(up) C
226 bis r31, r31, r31 C U1 mt
227 ADDSUB r5, r7, r7 C L0 lo + acc
228 addq r6, r21, r6 C U0 hi mul + carry
229 ldq r4, 0(rp) C L1
231 umulh v0, r1, r8 C U1
232 CMPCY( r5, r7), r20 C L0 lo add => carry
233 ADDSUB r7, r6, r23 C U0 hi add => answer
234 ldq r1, 8(up) C L1
236 mulq v0, r0, r2 C U1
237 CMPCY( r7, r23), r21 C L0 hi add => carry
238 addq r24, r20, r24 C U0 hi mul + carry
239 ldq r5, 8(rp) C L1
241 umulh v0, r0, r6 C U1
242 ADDSUB r4, r25, r25 C U0 lo + acc
243 stq r22, -16(rp) C L0
244 stq r23, -8(rp) C L1
246 bis r31, r31, r31 C L0 st slosh
247 mulq v0, r1, r7 C U1
248 bis r31, r31, r31 C L1 st slosh
249 addq r24, r21, r24 C U0 hi mul + carry
C Stage 2, entry for n mod 8 = 2 or 3: the loop counter r18 is stepped down
C by 8 here; stores go to rp+0 and rp+8.
250 $ent2:
251 CMPCY( r4, r25), r20 C L0 lo add => carry
252 bis r31, r31, r31 C U1 mt
253 lda r18, -8(r18) C L1 bookkeeping
254 ADDSUB r25, r24, r22 C U0 hi add => answer
256 bis r31, r31, r31 C U1 mt
257 CMPCY( r25, r22), r21 C L0 hi add => carry
258 addq r3, r20, r3 C U0 hi mul + carry
259 ldq r0, 16(up) C L1
261 bis r31, r31, r31 C U1 mt
262 ADDSUB r5, r28, r28 C L0 lo + acc
263 addq r3, r21, r3 C U0 hi mul + carry
264 ldq r4, 16(rp) C L1
266 umulh v0, r1, r24 C U1
267 CMPCY( r5, r28), r20 C L0 lo add => carry
268 ADDSUB r28, r3, r23 C U0 hi add => answer
269 ldq r1, 24(up) C L1
271 mulq v0, r0, r25 C U1
272 CMPCY( r28, r23), r21 C L0 hi add => carry
273 addq r8, r20, r8 C U0 hi mul + carry
274 ldq r5, 24(rp) C L1
276 umulh v0, r0, r3 C U1
277 ADDSUB r4, r2, r2 C U0 lo + acc
278 stq r22, 0(rp) C L0
279 stq r23, 8(rp) C L1
281 bis r31, r31, r31 C L0 st slosh
282 mulq v0, r1, r28 C U1
283 bis r31, r31, r31 C L1 st slosh
284 addq r8, r21, r8 C U0 hi mul + carry
C Stage 3, entry for n mod 8 = 0 or 1: up is advanced by 64 bytes here, so
C the source loads that follow use negative offsets; stores go to rp+16 and
C rp+24.
285 $ent0:
286 CMPCY( r4, r2), r20 C L0 lo add => carry
287 bis r31, r31, r31 C U1 mt
288 lda up, 64(up) C L1 bookkeeping
289 ADDSUB r2, r8, r22 C U0 hi add => answer
291 bis r31, r31, r31 C U1 mt
292 CMPCY( r2, r22), r21 C L0 hi add => carry
293 addq r6, r20, r6 C U0 hi mul + carry
294 ldq r0, -32(up) C L1
296 bis r31, r31, r31 C U1 mt
297 ADDSUB r5, r7, r7 C L0 lo + acc
298 addq r6, r21, r6 C U0 hi mul + carry
299 ldq r4, 32(rp) C L1
301 umulh v0, r1, r8 C U1
302 CMPCY( r5, r7), r20 C L0 lo add => carry
303 ADDSUB r7, r6, r23 C U0 hi add => answer
304 ldq r1, -24(up) C L1
306 mulq v0, r0, r2 C U1
307 CMPCY( r7, r23), r21 C L0 hi add => carry
308 addq r24, r20, r24 C U0 hi mul + carry
309 ldq r5, 40(rp) C L1
311 umulh v0, r0, r6 C U1
312 ADDSUB r4, r25, r25 C U0 lo + acc
313 stq r22, 16(rp) C L0
314 stq r23, 24(rp) C L1
316 bis r31, r31, r31 C L0 st slosh
317 mulq v0, r1, r7 C U1
318 bis r31, r31, r31 C L1 st slosh
319 addq r24, r21, r24 C U0 hi mul + carry
C Stage 4, entry for n mod 8 = 6 or 7: rp is advanced by 64 bytes here, so
C the stores land at rp-32 and rp-24.  The stage ends by starting the first
C carry step of the next pass, prefetching 256 bytes ahead in the source, and
C looping while r18 > 0.
320 $ent6:
321 CMPCY( r4, r25), r20 C L0 lo add => carry
322 bis r31, r31, r31 C U1 mt
323 lda rp, 64(rp) C L1 bookkeeping
324 ADDSUB r25, r24, r22 C U0 hi add => answer
326 bis r31, r31, r31 C U1 mt
327 CMPCY( r25, r22), r21 C L0 hi add => carry
328 addq r3, r20, r3 C U0 hi mul + carry
329 ldq r0, -16(up) C L1
331 bis r31, r31, r31 C U1 mt
332 ADDSUB r5, r28, r28 C L0 lo + acc
333 addq r3, r21, r3 C U0 hi mul + carry
334 ldq r4, -16(rp) C L1
336 umulh v0, r1, r24 C U1
337 CMPCY( r5, r28), r20 C L0 lo add => carry
338 ADDSUB r28, r3, r23 C U0 hi add => answer
339 ldq r1, -8(up) C L1
341 mulq v0, r0, r25 C U1
342 CMPCY( r28, r23), r21 C L0 hi add => carry
343 addq r8, r20, r8 C U0 hi mul + carry
344 ldq r5, -8(rp) C L1
346 umulh v0, r0, r3 C U1
347 ADDSUB r4, r2, r2 C U0 lo + acc
348 stq r22, -32(rp) C L0
349 stq r23, -24(rp) C L1
351 bis r31, r31, r31 C L0 st slosh
352 mulq v0, r1, r28 C U1
353 bis r31, r31, r31 C L1 st slosh
354 addq r8, r21, r8 C U0 hi mul + carry
356 CMPCY( r4, r2), r20 C L0 lo add => carry
357 ADDSUB r2, r8, r22 C U0 hi add => answer
358 ldl r31, 256(up) C prefetch up[]
359 bgt r18, $Loop C U1 bookkeeping
C Drain: same arithmetic as stage 1 but without starting any new source
C loads or low multiplies.  Stores two limbs at rp-16 and rp-8, then joins
C the shared finish at L(x) for the last two.
361 $Lend: CMPCY( r2, r22), r21 C
362 addq r6, r20, r6 C
363 ADDSUB r5, r7, r7 C
364 addq r6, r21, r6 C
365 ldq r4, 0(rp) C
366 umulh v0, r1, r8 C
367 CMPCY( r5, r7), r20 C
368 ADDSUB r7, r6, r23 C
369 CMPCY(r7, r23), r21 C
370 addq r24, r20, r24 C
371 ldq r5, 8(rp) C
372 ADDSUB r4, r25, r25 C
373 stq r22, -16(rp) C
374 stq r23, -8(rp) C
375 addq r24, r21, r24 C
376 br L(x)
378 ALIGN(16)
C Short tail reached from the 2 mod 8 case when only two limbs remain: load
C both destination limbs, finish the second high product, start the first
C combine, then fall into L(x).
379 $n23: ldq r4, 0(rp) C
380 ldq r5, 8(rp) C
381 umulh v0, r1, r8 C
382 ADDSUB r4, r25, r25 C
C Shared finish: form and store the last two result limbs at rp+0 and rp+8.
C r0 = last high product + both carry bits is the value left for the caller.
383 L(x): CMPCY( r4, r25), r20 C
384 ADDSUB r25, r24, r22 C
385 CMPCY( r25, r22), r21 C
386 addq r3, r20, r3 C
387 ADDSUB r5, r28, r28 C
388 addq r3, r21, r3 C
389 CMPCY( r5, r28), r20 C
390 ADDSUB r28, r3, r23 C
391 CMPCY( r28, r23), r21 C
392 addq r8, r20, r8 C
393 stq r22, 0(rp) C
394 stq r23, 8(rp) C
395 addq r8, r21, r0 C
396 ret r31, (r26), 1 C
397 EPILOGUE()
398 ASM_END()