beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / ia64 / rsh1aors_n.asm
blob3c7defb0baeb493330eb37f09193745bc84a0017
1 dnl IA-64 mpn_rsh1add_n/mpn_rsh1sub_n -- rp[] = (up[] +- vp[]) >> 1.
3 dnl Contributed to the GNU project by Torbjorn Granlund.
5 dnl Copyright 2003-2005 Free Software Foundation, Inc.
7 dnl This file is part of the GNU MP Library.
8 dnl
9 dnl The GNU MP Library is free software; you can redistribute it and/or modify
10 dnl it under the terms of either:
11 dnl
12 dnl * the GNU Lesser General Public License as published by the Free
13 dnl Software Foundation; either version 3 of the License, or (at your
14 dnl option) any later version.
15 dnl
16 dnl or
17 dnl
18 dnl * the GNU General Public License as published by the Free Software
19 dnl Foundation; either version 2 of the License, or (at your option) any
20 dnl later version.
21 dnl
22 dnl or both in parallel, as here.
23 dnl
24 dnl The GNU MP Library is distributed in the hope that it will be useful, but
25 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
26 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
27 dnl for more details.
28 dnl
29 dnl You should have received copies of the GNU General Public License and the
30 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
31 dnl see https://www.gnu.org/licenses/.
33 include(`../config.m4')
35 C cycles/limb
36 C Itanium: 2.5
37 C Itanium 2: 1.5
39 C TODO
40 C * Rewrite function entry code using aorslsh1_n.asm style.
41 C * Micro-optimize feed-in and wind-down code.
43 C INPUT PARAMETERS
C rp is the pointer that result limbs are stored through, up and vp are
C the two source pointers that limbs are loaded from, and n is presumably
C the operand length in limbs (the entry code splits it into n mod 4 and
C a loop count).
C NOTE(review): r32-r35 are presumably the incoming argument registers
C for (rp, up, vp, n) -- confirm against the IA-64 calling convention.
44 define(`rp',`r32')
45 define(`up',`r33')
46 define(`vp',`r34')
47 define(`n',`r35')
C Operation selection.  The build is presumably expected to define exactly
C one of OPERATION_rsh1add_n and OPERATION_rsh1sub_n; each branch below
C supplies the pieces that differ between the add and subtract variants:
C   ADDSUB  the limb operation, add or sub
C   PRED    unsigned compare that detects carry (ltu) or borrow (gtu)
C   INCR    value applied to a limb when a carry or borrow comes in
C   LIM     limb value that wraps when INCR is applied, so the carry or
C           borrow has to be passed on to the next limb
C   func    name of the generated entry point
C Each ifdef body is a quoted string and must be closed by a quote and a
C right parenthesis on its own line, as done below.
ifdef(`OPERATION_rsh1add_n',`
  define(ADDSUB, add)
  define(PRED, ltu)
  define(INCR, 1)
  define(LIM, -1)
  define(func, mpn_rsh1add_n)
')
ifdef(`OPERATION_rsh1sub_n',`
  define(ADDSUB, sub)
  define(PRED, gtu)
  define(INCR, -1)
  define(LIM, 0)
  define(func, mpn_rsh1sub_n)
')
64 C Some useful aliases for registers we use
C u0-u3 receive limbs loaded through up, v0-v3 receive limbs loaded
C through vp, w0-w3 hold the per-limb ADDSUB results, and x0-x3 hold the
C shifted limbs that are stored through rp.
C Note that x1 is r9 rather than part of the r26-r31 run, and that u0
C shares r14 with the n mod 4 scratch value computed at function entry.
65 define(`u0',`r14') define(`u1',`r15') define(`u2',`r16') define(`u3',`r17')
66 define(`v0',`r18') define(`v1',`r19') define(`v2',`r20') define(`v3',`r21')
67 define(`w0',`r22') define(`w1',`r23') define(`w2',`r24') define(`w3',`r25')
68 define(`x0',`r26') define(`x1',`r9') define(`x2',`r30') define(`x3',`r31')
C func -- expands to mpn_rsh1add_n or mpn_rsh1sub_n depending on which
C OPERATION_* symbol is defined.  Per the file header it computes
C rp[] = (up[] +- vp[]) >> 1.
C
C Register use visible in this body:
C   r10, r11  first limb of up[] and of vp[]
C   r14       n mod 4 during dispatch (the same register is u0 later on)
C   r2        saved copy of ar.lc, written back just before returning
C   r8        bit 0 of the first limb sum or difference, that is the bit
C             the one-bit right shift drops (presumably the return value
C             -- confirm against the documented function contract)
C   p6-p9     carry or borrow flags chained from one limb to the next
C   p15       set when n > 4
C
C Carry idiom used throughout: cmp.PRED sets a flag when the limb result w
C wrapped relative to its u operand; a predicated cmp.eq.or then also sets
C the next flag when an incoming carry meets w == LIM (applying INCR would
C wrap it); a predicated add applies INCR.  shrp with count 1 then merges
C two adjacent limb results into one output limb.
C
C NOTE(review): in this copy no instruction-group stops and no closing
C braces for the bundles opened below are visible, and the HAVE_ABI_32
C conditional has no visible terminator.  They look lost in extraction;
C confirm against the upstream file before assembling.
70 MULFUNC_PROLOGUE(mpn_rsh1add_n mpn_rsh1sub_n)
72 ASM_START()
73 PROLOGUE(func)
74 .prologue
75 .save ar.lc, r2
76 .body
C With the 32-bit ABI the three pointers are converted with addp4 and n
C is zero-extended from 32 bits before use.
77 ifdef(`HAVE_ABI_32',`
78 addp4 rp = 0, rp C M I
79 addp4 up = 0, up C M I
80 addp4 vp = 0, vp C M I
81 nop.m 0
82 nop.m 0
83 zxt4 n = n C I
C Entry: load the first limb of each source (post-incrementing the
C pointers by 8), save ar.lc, compute n mod 4 and p15 = (n > 4), and
C bias n by -4 for the later loop-count computation.
86 {.mmi; ld8 r11 = [vp], 8 C M01
87 ld8 r10 = [up], 8 C M01
88 mov.i r2 = ar.lc C I0
89 }{.mmi; and r14 = 3, n C M I
90 cmp.lt p15, p0 = 4, n C M I
91 add n = -4, n C M I
C Dispatch on n mod 4: 1 goes to .Lb01, 2 to .Lb10, 3 to .Lb11, and 0
C falls through into .Lb00.
93 }{.mmi; cmp.eq p6, p0 = 1, r14 C M I
94 cmp.eq p7, p0 = 2, r14 C M I
95 cmp.eq p8, p0 = 3, r14 C M I
96 }{.bbb
97 (p6) br.dptk .Lb01 C B
98 (p7) br.dptk .Lb10 C B
99 (p8) br.dptk .Lb11 C B
C n mod 4 == 0.  Three more limb pairs are loaded, n becomes the biased
C count divided by 4, and operands longer than 4 limbs branch to .grt4.
102 .Lb00: ld8 v0 = [vp], 8 C M01
103 ld8 u0 = [up], 8 C M01
104 shr.u n = n, 2 C I0
106 ld8 v1 = [vp], 8 C M01
107 ld8 u1 = [up], 8 C M01
108 ADDSUB w3 = r10, r11 C M I
110 ld8 v2 = [vp], 8 C M01
111 ld8 u2 = [up], 8 C M01
112 (p15) br.dpnt .grt4 C B
C Short case (p15 clear): no loop, finish through the wind-down at .Lcj4.
115 cmp.PRED p7, p0 = w3, r10 C M I
116 and r8 = 1, w3 C M I
117 ADDSUB w0 = u0, v0 C M I
119 cmp.PRED p8, p0 = w0, u0 C M I
120 ADDSUB w1 = u1, v1 C M I
122 cmp.PRED p9, p0 = w1, u1 C M I
123 (p7) cmp.eq.or p8, p0 = LIM, w0 C M I
124 (p7) add w0 = INCR, w0 C M I
126 shrp x3 = w0, w3, 1 C I0
127 ADDSUB w2 = u2, v2 C M I
128 (p8) cmp.eq.or p9, p0 = LIM, w1 C M I
129 (p8) add w1 = INCR, w1 C M I
130 br .Lcj4 C B
C Long case: keep loading ahead, set ar.lc to n - 1, and enter the main
C loop at .LL00.
132 .grt4: ld8 v3 = [vp], 8 C M01
133 cmp.PRED p7, p0 = w3, r10 C M I
134 ld8 u3 = [up], 8 C M01
135 and r8 = 1, w3 C M I
137 ADDSUB w0 = u0, v0 C M I
138 ld8 v0 = [vp], 8 C M01
139 add n = -1, n
141 cmp.PRED p8, p0 = w0, u0 C M I
142 ld8 u0 = [up], 8 C M01
143 ADDSUB w1 = u1, v1 C M I
145 ld8 v1 = [vp], 8 C M01
146 mov.i ar.lc = n C I0
147 cmp.PRED p9, p0 = w1, u1 C M I
148 ld8 u1 = [up], 8 C M01
149 (p7) cmp.eq.or p8, p0 = LIM, w0 C M I
150 (p7) add w0 = INCR, w0 C M I
152 ADDSUB w2 = u2, v2 C M I
153 ld8 v2 = [vp], 8 C M01
154 shrp x3 = w0, w3, 1 C I0
155 (p8) cmp.eq.or p9, p0 = LIM, w1 C M I
156 (p8) add w1 = INCR, w1 C M I
157 br .LL00 C B
C n mod 4 == 1.  With p15 clear only the first limb is processed: its
C result is shifted right by one and, when p6 reports a carry or borrow,
C bit 63 is set before the single store at .Lcj1.
160 .Lb01: ADDSUB w2 = r10, r11 C M I
161 shr.u n = n, 2 C I0
162 (p15) br.dpnt .grt1 C B
165 cmp.PRED p6, p7 = w2, r10 C M I
166 shr.u x2 = w2, 1 C I0
167 and r8 = 1, w2 C M I
169 (p6) dep x2 = -1, x2, 63, 1 C I0
170 br .Lcj1 C B
C Longer operand: load four more limb pairs and set ar.lc = n.  The
C br.cloop picks between the wind-down entry .Lcj5 (fall-through) and
C the loop entry .LL01 (via .grt5).
172 .grt1: ld8 v3 = [vp], 8 C M01
173 ld8 u3 = [up], 8 C M01
175 ld8 v0 = [vp], 8 C M01
176 ld8 u0 = [up], 8 C M01
177 mov.i ar.lc = n C FIXME swap with next I0
179 ld8 v1 = [vp], 8 C M01
180 ld8 u1 = [up], 8 C M01
182 ld8 v2 = [vp], 8 C M01
183 ld8 u2 = [up], 8 C M01
184 cmp.PRED p6, p0 = w2, r10 C M I
185 and r8 = 1, w2 C M I
186 ADDSUB w3 = u3, v3 C M I
187 br.cloop.dptk .grt5 C B
190 cmp.PRED p7, p0 = w3, u3 C M I
192 ADDSUB w0 = u0, v0 C M I
193 (p6) cmp.eq.or p7, p0 = LIM, w3 C M I
194 (p6) add w3 = INCR, w3 C M I
196 cmp.PRED p8, p0 = w0, u0 C M I
197 shrp x2 = w3, w2, 1 C I0
198 ADDSUB w1 = u1, v1 C M I
200 cmp.PRED p9, p0 = w1, u1 C M I
201 (p7) cmp.eq.or p8, p0 = LIM, w0 C M I
202 (p7) add w0 = INCR, w0 C M I
203 br .Lcj5 C B
205 .grt5: ld8 v3 = [vp], 8 C M01
206 cmp.PRED p7, p0 = w3, u3 C M I
207 ld8 u3 = [up], 8 C M01
209 ADDSUB w0 = u0, v0 C M I
210 ld8 v0 = [vp], 8 C M01
211 (p6) cmp.eq.or p7, p0 = LIM, w3 C M I
212 (p6) add w3 = INCR, w3 C M I
214 cmp.PRED p8, p0 = w0, u0 C M I
215 shrp x2 = w3, w2, 1 C I0
216 ld8 u0 = [up], 8 C M01
217 ADDSUB w1 = u1, v1 C M I
219 ld8 v1 = [vp], 8 C M01
220 cmp.PRED p9, p0 = w1, u1 C M I
221 ld8 u1 = [up], 8 C M01
222 (p7) cmp.eq.or p8, p0 = LIM, w0 C M I
223 (p7) add w0 = INCR, w0 C M I
224 br .LL01 C B
C n mod 4 == 2.  Short case finishes at .Lcj2; longer operands go through
C .grt2 and then either .Lcj6 (fall-through) or the loop entry .LL10.
227 .Lb10: ld8 v2 = [vp], 8 C M01
228 ld8 u2 = [up], 8 C M01
229 shr.u n = n, 2 C I0
230 ADDSUB w1 = r10, r11 C M I
231 (p15) br.dpnt .grt2 C B
234 cmp.PRED p9, p0 = w1, r10 C M I
235 and r8 = 1, w1 C M I
236 ADDSUB w2 = u2, v2 C M I
238 cmp.PRED p6, p0 = w2, u2 C M I
240 (p9) cmp.eq.or p6, p0 = LIM, w2 C M I
241 (p9) add w2 = INCR, w2 C M I
243 shrp x1 = w2, w1, 1 C I0
244 shr.u x2 = w2, 1 C I0
245 br .Lcj2 C B
247 .grt2: ld8 v3 = [vp], 8 C M01
248 ld8 u3 = [up], 8 C M01
250 ld8 v0 = [vp], 8 C M01
251 ld8 u0 = [up], 8 C M01
252 mov.i ar.lc = n C I0
254 ld8 v1 = [vp], 8 C M01
255 cmp.PRED p9, p0 = w1, r10 C M I
256 ld8 u1 = [up], 8 C M01
257 and r8 = 1, w1 C M I
259 ADDSUB w2 = u2, v2 C M I
260 ld8 v2 = [vp], 8 C M01
262 cmp.PRED p6, p0 = w2, u2 C M I
263 ld8 u2 = [up], 8 C M01
264 ADDSUB w3 = u3, v3 C M I
265 br.cloop.dptk .grt6 C B
268 cmp.PRED p7, p0 = w3, u3 C M I
269 (p9) cmp.eq.or p6, p0 = LIM, w2 C M I
270 (p9) add w2 = INCR, w2 C M I
272 shrp x1 = w2, w1, 1 C I0
273 ADDSUB w0 = u0, v0 C M I
274 (p6) cmp.eq.or p7, p0 = LIM, w3 C M I
275 (p6) add w3 = INCR, w3 C M I
276 br .Lcj6 C B
278 .grt6: ld8 v3 = [vp], 8 C M01
279 cmp.PRED p7, p0 = w3, u3 C M I
280 ld8 u3 = [up], 8 C M01
281 (p9) cmp.eq.or p6, p0 = LIM, w2 C M I
282 (p9) add w2 = INCR, w2 C M I
284 shrp x1 = w2, w1, 1 C I0
285 ADDSUB w0 = u0, v0 C M I
286 ld8 v0 = [vp], 8 C M01
287 (p6) cmp.eq.or p7, p0 = LIM, w3 C M I
288 (p6) add w3 = INCR, w3 C M I
289 br .LL10 C B
C n mod 4 == 3.  Short case finishes at .Lcj3; longer operands go through
C .grt3 and then either .Lcj7 (fall-through) or the loop entry .LL11.
292 .Lb11: ld8 v1 = [vp], 8 C M01
293 ld8 u1 = [up], 8 C M01
294 shr.u n = n, 2 C I0
296 ld8 v2 = [vp], 8 C M01
297 ld8 u2 = [up], 8 C M01
298 ADDSUB w0 = r10, r11 C M I
299 (p15) br.dpnt .grt3 C B
302 cmp.PRED p8, p0 = w0, r10 C M I
303 ADDSUB w1 = u1, v1 C M I
304 and r8 = 1, w0 C M I
306 cmp.PRED p9, p0 = w1, u1 C M I
308 ADDSUB w2 = u2, v2 C M I
309 (p8) cmp.eq.or p9, p0 = LIM, w1 C M I
310 (p8) add w1 = INCR, w1 C M I
312 cmp.PRED p6, p0 = w2, u2 C M I
313 shrp x0 = w1, w0, 1 C I0
315 (p9) cmp.eq.or p6, p0 = LIM, w2 C M I
316 (p9) add w2 = INCR, w2 C M I
317 br .Lcj3 C B
319 .grt3: ld8 v3 = [vp], 8 C M01
320 ld8 u3 = [up], 8 C M01
322 ld8 v0 = [vp], 8 C M01
323 mov.i ar.lc = n C I0
324 cmp.PRED p8, p0 = w0, r10 C M I
325 ld8 u0 = [up], 8 C M01
326 ADDSUB w1 = u1, v1 C M I
327 and r8 = 1, w0 C M I
329 ld8 v1 = [vp], 8 C M01
330 cmp.PRED p9, p0 = w1, u1 C M I
331 ld8 u1 = [up], 8 C M01
333 ADDSUB w2 = u2, v2 C M I
334 ld8 v2 = [vp], 8 C M01
335 (p8) cmp.eq.or p9, p0 = LIM, w1 C M I
336 (p8) add w1 = INCR, w1 C M I
338 cmp.PRED p6, p0 = w2, u2 C M I
339 shrp x0 = w1, w0, 1 C I0
340 ld8 u2 = [up], 8 C M01
341 ADDSUB w3 = u3, v3 C M I
342 br.cloop.dptk .grt7 C B
345 cmp.PRED p7, p0 = w3, u3 C M I
346 (p9) cmp.eq.or p6, p0 = LIM, w2 C M I
347 (p9) add w2 = INCR, w2 C M I
348 br .Lcj7 C B
350 .grt7: ld8 v3 = [vp], 8 C M01
351 cmp.PRED p7, p0 = w3, u3 C M I
352 ld8 u3 = [up], 8 C M01
353 (p9) cmp.eq.or p6, p0 = LIM, w2 C M I
354 (p9) add w2 = INCR, w2 C M I
355 br .LL11 C B
C Each pass through .Loop stores four result limbs (x3, x0, x1, x2) while
C loading and combining the limbs for later passes; br.cloop counts passes
C in ar.lc.  .LL11, .LL10, .LL01 and .LL00 are the entry points used by
C the feed-in code above.
358 C *** MAIN LOOP START ***
359 ALIGN(32)
360 .Loop: st8 [rp] = x3, 8 C M23
361 ld8 v3 = [vp], 8 C M01
362 cmp.PRED p7, p0 = w3, u3 C M I
363 ld8 u3 = [up], 8 C M01
364 (p9) cmp.eq.or p6, p0 = LIM, w2 C M I
365 (p9) add w2 = INCR, w2 C M I
367 .LL11: st8 [rp] = x0, 8 C M23
368 shrp x1 = w2, w1, 1 C I0
369 ADDSUB w0 = u0, v0 C M I
370 ld8 v0 = [vp], 8 C M01
371 (p6) cmp.eq.or p7, p0 = LIM, w3 C M I
372 (p6) add w3 = INCR, w3 C M I
374 .LL10: cmp.PRED p8, p0 = w0, u0 C M I
375 shrp x2 = w3, w2, 1 C I0
376 nop.b 0
377 ld8 u0 = [up], 8 C M01
378 ADDSUB w1 = u1, v1 C M I
379 nop.b 0
381 st8 [rp] = x1, 8 C M23
382 ld8 v1 = [vp], 8 C M01
383 cmp.PRED p9, p0 = w1, u1 C M I
384 ld8 u1 = [up], 8 C M01
385 (p7) cmp.eq.or p8, p0 = LIM, w0 C M I
386 (p7) add w0 = INCR, w0 C M I
388 .LL01: st8 [rp] = x2, 8 C M23
389 shrp x3 = w0, w3, 1 C I0
390 ADDSUB w2 = u2, v2 C M I
391 ld8 v2 = [vp], 8 C M01
392 (p8) cmp.eq.or p9, p0 = LIM, w1 C M I
393 (p8) add w1 = INCR, w1 C M I
395 .LL00: cmp.PRED p6, p0 = w2, u2 C M I
396 shrp x0 = w1, w0, 1 C I0
397 nop.b 0
398 ld8 u2 = [up], 8 C M01
399 ADDSUB w3 = u3, v3 C M I
400 br.cloop.dptk .Loop C B
402 C *** MAIN LOOP END ***
C Wind-down: the same sequence as the loop body without further loads.
C .Lcj7 down to .Lcj1 are also the entry points for operands that never
C enter the loop.
404 .Lskip: st8 [rp] = x3, 8 C M23
405 cmp.PRED p7, p0 = w3, u3 C M I
406 (p9) cmp.eq.or p6, p0 = LIM, w2 C M I
407 (p9) add w2 = INCR, w2 C M I
409 .Lcj7: st8 [rp] = x0, 8 C M23
410 shrp x1 = w2, w1, 1 C I0
411 ADDSUB w0 = u0, v0 C M I
412 (p6) cmp.eq.or p7, p0 = LIM, w3 C M I
413 (p6) add w3 = INCR, w3 C M I
415 .Lcj6: cmp.PRED p8, p0 = w0, u0 C M I
416 shrp x2 = w3, w2, 1 C I0
417 ADDSUB w1 = u1, v1 C M I
419 st8 [rp] = x1, 8 C M23
420 cmp.PRED p9, p0 = w1, u1 C M I
421 (p7) cmp.eq.or p8, p0 = LIM, w0 C M I
422 (p7) add w0 = INCR, w0 C M I
424 .Lcj5: st8 [rp] = x2, 8 C M23
425 shrp x3 = w0, w3, 1 C I0
426 ADDSUB w2 = u2, v2 C M I
427 (p8) cmp.eq.or p9, p0 = LIM, w1 C M I
428 (p8) add w1 = INCR, w1 C M I
430 .Lcj4: cmp.PRED p6, p0 = w2, u2 C M I
431 shrp x0 = w1, w0, 1 C I0
433 st8 [rp] = x3, 8 C M23
434 (p9) cmp.eq.or p6, p0 = LIM, w2 C M I
435 (p9) add w2 = INCR, w2 C M I
C Most significant limb: the last result shifted right by one, with bit 63
C set when p6 reports a final carry or borrow.
437 .Lcj3: st8 [rp] = x0, 8 C M23
438 shrp x1 = w2, w1, 1 C I0
439 shr.u x2 = w2, 1 C I0
441 .Lcj2: st8 [rp] = x1, 8 C M23
442 (p6) dep x2 = -1, x2, 63, 1 C I0
C Final store does not advance rp; ar.lc is restored from r2 before return.
444 .Lcj1: st8 [rp] = x2 C M23
445 mov.i ar.lc = r2 C I0
446 br.ret.sptk.many b0 C B
447 EPILOGUE()