beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / ia64 / lorrshift.asm
blob694aaf0f400f5877ef748988abf91cba9f3920f2
1 dnl IA-64 mpn_lshift/mpn_rshift.
3 dnl Contributed to the GNU project by Torbjorn Granlund.
5 dnl Copyright 2000-2005 Free Software Foundation, Inc.
7 dnl This file is part of the GNU MP Library.
8 dnl
9 dnl The GNU MP Library is free software; you can redistribute it and/or modify
10 dnl it under the terms of either:
11 dnl
12 dnl * the GNU Lesser General Public License as published by the Free
13 dnl Software Foundation; either version 3 of the License, or (at your
14 dnl option) any later version.
15 dnl
16 dnl or
17 dnl
18 dnl * the GNU General Public License as published by the Free Software
19 dnl Foundation; either version 2 of the License, or (at your option) any
20 dnl later version.
21 dnl
22 dnl or both in parallel, as here.
23 dnl
24 dnl The GNU MP Library is distributed in the hope that it will be useful, but
25 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
26 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
27 dnl for more details.
28 dnl
29 dnl You should have received copies of the GNU General Public License and the
30 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
31 dnl see https://www.gnu.org/licenses/.
33 include(`../config.m4')
35 C cycles/limb
36 C Itanium: 2
37 C Itanium 2: 1
39 C This code is scheduled deeply since the plain shift instructions shr and shl
40 C have a latency of 4 (on Itanium) or 3 (on Itanium 2). Poor scheduling of
41 C these instructions causes a 10 cycle replay trap on Itanium.
43 C The ld8 scheduling should probably be decreased to make the function smaller.
44 C Good lfetch will make sure we never stall anyway.
46 C We should actually issue the first ld8 at cycle 0, and the first BSH/FSH pair
47 C at cycle 2. Judicious use of predicates could allow us to issue more ld8's
48 C in the prologue.
51 C INPUT PARAMETERS
C Argument registers: rp, up, n and cnt are r32 through r35, in that order.
52 define(`rp', `r32')
53 define(`up', `r33')
54 define(`n', `r34')
55 define(`cnt',`r35')
C Scratch register; the function entry code sets it to 64 - cnt.
57 define(`tnc',`r9')
C Direction-dependent names follow.  For lshift FSH is shl and BSH is shr.u,
C and the pointer step UPD, the prefetch offset POFF and the prefetch step
C PUPD are all negative.
C NOTE review: the embedded line numbers skip 66 and 74-75, and no closing of
C either ifdef is visible in this copy - confirm against upstream.
59 ifdef(`OPERATION_lshift',`
60 define(`FSH',`shl')
61 define(`BSH',`shr.u')
62 define(`UPD',`-8')
63 define(`POFF',`-512')
64 define(`PUPD',`-32')
65 define(`func',`mpn_lshift')
C For rshift the two shift mnemonics swap and the three steps are positive.
67 ifdef(`OPERATION_rshift',`
68 define(`FSH',`shr.u')
69 define(`BSH',`shl')
70 define(`UPD',`8')
71 define(`POFF',`512')
72 define(`PUPD',`32')
73 define(`func',`mpn_rshift')
76 MULFUNC_PROLOGUE(mpn_lshift mpn_rshift)
78 ASM_START()
C Entry point.  func is defined as mpn_lshift or mpn_rshift depending on
C which OPERATION_ symbol is set.  On every path the first limb read is
C shifted by BSH into r8, which the code marks as the function return value.
79 PROLOGUE(func)
80 .prologue
81 .save ar.lc, r2
82 .body
C With HAVE_ABI_32 the two pointers go through addp4, n is sign-extended
C from its low 32 bits and cnt is zero-extended from its low 32 bits.
C NOTE review: embedded line numbers 90-92 are absent and the closing of
C this ifdef is not visible in this copy - confirm against upstream.
83 ifdef(`HAVE_ABI_32',
84 ` addp4 rp = 0, rp C M I
85 addp4 up = 0, up C M I
86 sxt4 n = n C M I
87 nop.m 0
88 nop.m 0
89 zxt4 cnt = cnt C I
C p14 is set when n is greater than 4 and p15 otherwise.  r14 = n mod 4
C selects the entry path below.  ar.lc is copied to r2 so it can be restored
C before the return.
93 {.mmi; cmp.lt p14, p15 = 4, n C M I
94 and r14 = 3, n C M I
95 mov.i r2 = ar.lc C I0
C r15 = n - 1 is the limb offset used for lshift below, tnc = 64 - cnt is the
C complementary shift count, and r16 = n - 5 feeds the loop count.
96 }{.mmi; add r15 = -1, n C M I
97 sub tnc = 64, cnt C M I
98 add r16 = -5, n
C p6 and p7 flag n mod 4 equal to 1 and 2.  n is reused for the loop count:
C r16 shifted right by 2 as an unsigned value.
100 }{.mmi; cmp.eq p6, p0 = 1, r14 C M I
101 cmp.eq p7, p0 = 2, r14 C M I
102 shr.u n = r16, 2 C I0
C p8 flags n mod 4 equal to 3.
103 }{.mmi; cmp.eq p8, p0 = 3, r14 C M I
C lshift only: add 8 * r15 to up and to rp, so both start n - 1 limbs above
C their incoming value and then step by the negative UPD.
104 ifdef(`OPERATION_lshift',
105 ` shladd up = r15, 3, up C M I
106 shladd rp = r15, 3, rp') C M I
C r11 becomes the lfetch pointer, POFF bytes away from up.  The first limb is
C loaded into r10 with up stepped by UPD, and the loop count goes to ar.lc.
108 }{.mmi; add r11 = POFF, up C M I
109 ld8 r10 = [up], UPD C M01
110 mov.i ar.lc = n C I0
C Dispatch on n mod 4.  When none of p6, p7, p8 is set control falls through
C to .Lb00.
111 }{.bbb;
112 (p6) br.dptk .Lb01
113 (p7) br.dptk .Lb10
114 (p8) br.dptk .Lb11
115 ;; }
C Feed-in path for n mod 4 == 0, reached by falling through the dispatch
C branches.  Three more limbs are loaded after r10, giving r10, r19, r16, r17.
C NOTE review: the embedded line numbers skip values between instruction
C groups here, so lines of the original, possibly stop lines, may be missing
C from this copy - confirm against upstream.
117 .Lb00: ld8 r19 = [up], UPD
119 ld8 r16 = [up], UPD
121 ld8 r17 = [up], UPD
122 BSH r8 = r10, tnc C function return value
124 FSH r24 = r10, cnt
125 BSH r25 = r19, tnc
C p14 means n is greater than 4: continue at .grt4, which issues more loads.
126 (p14) br.cond.dptk .grt4
C Otherwise compute the remaining shifted pieces for the four loaded limbs
C and join the store tail at .Lr4, which performs four stores.
128 FSH r26 = r19, cnt
129 BSH r27 = r16, tnc
131 FSH r20 = r16, cnt
132 BSH r21 = r17, tnc
134 or r14 = r25, r24
135 FSH r22 = r17, cnt
C NOTE review: r23 is not read between .Lr4 and the return, so the result of
C the next instruction looks unused on this path - confirm.
136 BSH r23 = r10, tnc
137 br .Lr4
C More than four limbs: keep loading while the shifted pieces of the earlier
C limbs are produced, so the registers match what the loop expects at .Ltop.
139 .grt4: ld8 r18 = [up], UPD
140 FSH r26 = r19, cnt
141 BSH r27 = r16, tnc
143 ld8 r19 = [up], UPD
144 FSH r20 = r16, cnt
145 BSH r21 = r17, tnc
147 ld8 r16 = [up], UPD
148 FSH r22 = r17, cnt
149 BSH r23 = r18, tnc
151 or r14 = r25, r24
152 ld8 r17 = [up], UPD
C Counted branch on ar.lc: enter the main loop when taken, otherwise go
C straight to the drain code at .Lbot.
153 br.cloop.dpnt .Ltop
154 br .Lbot
C Feed-in path for n mod 4 == 1.  p15 means n is not greater than 4; in that
C case only the limb already in r10 is shifted and control goes to .Lr1,
C which performs the single store of r22.
156 .Lb01:
157 (p15) BSH r8 = r10, tnc C function return value I
158 (p15) FSH r22 = r10, cnt C I
159 (p15) br.cond.dptk .Lr1 C return B
C n greater than 4: load four more limbs into r18, r19, r16, r17 while the
C shifted pieces of the earlier limbs are produced.
161 .grt1: ld8 r18 = [up], UPD
163 ld8 r19 = [up], UPD
164 BSH r8 = r10, tnc C function return value
166 ld8 r16 = [up], UPD
167 FSH r22 = r10, cnt
168 BSH r23 = r18, tnc
170 ld8 r17 = [up], UPD
171 FSH r24 = r18, cnt
172 BSH r25 = r19, tnc
C Counted branch on ar.lc: when taken, more limbs remain and .grt5 keeps
C loading.
173 br.cloop.dpnt .grt5
C Not taken: no further loads.  Finish the pieces for the five loaded limbs
C and join the store tail at .Lr5, which performs five stores.
175 or r15 = r23, r22
176 FSH r26 = r19, cnt
177 BSH r27 = r16, tnc
179 FSH r20 = r16, cnt
180 BSH r21 = r17, tnc
181 br .Lr5
C Three further loads, then enter the main loop at its .LL01 stage.
183 .grt5: ld8 r18 = [up], UPD
184 FSH r26 = r19, cnt
185 BSH r27 = r16, tnc
187 ld8 r19 = [up], UPD
188 FSH r20 = r16, cnt
189 BSH r21 = r17, tnc
191 or r15 = r23, r22
192 ld8 r16 = [up], UPD
193 br .LL01
C Feed-in path for n mod 4 == 2.  A second limb is loaded into r17; p14
C means n is greater than 4 and sends control to .grt2.
196 .Lb10: ld8 r17 = [up], UPD
197 (p14) br.cond.dptk .grt2
C Otherwise shift the two loaded limbs and join the store tail at .Lr2,
C which performs two stores.
199 BSH r8 = r10, tnc C function return value
201 FSH r20 = r10, cnt
202 BSH r21 = r17, tnc
204 or r14 = r21, r20
205 FSH r22 = r17, cnt
206 br .Lr2 C return
C n greater than 4: load four more limbs into r18, r19, r16, r17 while the
C shifted pieces of the earlier limbs are produced.
208 .grt2: ld8 r18 = [up], UPD
209 BSH r8 = r10, tnc C function return value
211 ld8 r19 = [up], UPD
212 FSH r20 = r10, cnt
213 BSH r21 = r17, tnc
215 ld8 r16 = [up], UPD
216 FSH r22 = r17, cnt
217 BSH r23 = r18, tnc
219 {.mmi; ld8 r17 = [up], UPD
220 or r14 = r21, r20
221 FSH r24 = r18, cnt
222 }{.mib; nop 0
223 BSH r25 = r19, tnc
C Counted branch on ar.lc: when taken, more limbs remain and .grt6 keeps
C loading.
224 br.cloop.dpnt .grt6
225 ;; }
C Not taken: no further loads.  Join the store tail at .Lr6, which performs
C six stores.
227 FSH r26 = r19, cnt
228 BSH r27 = r16, tnc
229 br .Lr6
C Two further loads, then enter the main loop at its .LL10 stage.
231 .grt6: ld8 r18 = [up], UPD
232 FSH r26 = r19, cnt
233 BSH r27 = r16, tnc
235 ld8 r19 = [up], UPD
236 br .LL10
C Feed-in path for n mod 4 == 3.  Two more limbs are loaded into r16 and
C r17; p14 means n is greater than 4 and sends control to .grt3.
239 .Lb11: ld8 r16 = [up], UPD
241 ld8 r17 = [up], UPD
242 BSH r8 = r10, tnc C function return value
243 (p14) br.cond.dptk .grt3
C Otherwise shift the three loaded limbs and join the store tail at .Lr3,
C which performs three stores.
246 FSH r26 = r10, cnt
247 BSH r27 = r16, tnc
249 FSH r20 = r16, cnt
250 BSH r21 = r17, tnc
252 or r15 = r27, r26
253 FSH r22 = r17, cnt
254 br .Lr3 C return
C n greater than 4: load four more limbs into r18, r19, r16, r17 while the
C shifted pieces of the earlier limbs are produced.
256 .grt3: ld8 r18 = [up], UPD
257 FSH r26 = r10, cnt
258 BSH r27 = r16, tnc
260 ld8 r19 = [up], UPD
261 FSH r20 = r16, cnt
262 BSH r21 = r17, tnc
264 ld8 r16 = [up], UPD
265 FSH r22 = r17, cnt
266 BSH r23 = r18, tnc
268 ld8 r17 = [up], UPD
C Counted branch on ar.lc: when taken, more limbs remain and .grt7 keeps
C loading.
269 br.cloop.dpnt .grt7
C Not taken: no further loads.  Join the store tail at .Lr7, which performs
C seven stores.
271 or r15 = r27, r26
272 FSH r24 = r18, cnt
273 BSH r25 = r19, tnc
274 br .Lr7
C Same three operations as the fall-through case plus one more load, then
C enter the main loop at its .LL11 stage.
276 .grt7: or r15 = r27, r26
277 FSH r24 = r18, cnt
278 BSH r25 = r19, tnc
279 ld8 r18 = [up], UPD
280 br .LL11
282 C *** MAIN LOOP START ***
C Each pass through the loop performs four stores and four loads, one of
C each per stage.  A stage stores one finished limb through rp, ORs two
C shifted pieces for the store of the following stage, applies FSH to one
C limb and BSH to the next, and loads a new limb through up.  The FSH and
C BSH results are ORed together three stages after they are produced.
C The labels .LL11, .LL10 and .LL01 are entry points used by the feed-in
C paths to join the loop part way through.
283 ALIGN(32)
284 .Ltop:
285 {.mmi; st8 [rp] = r14, UPD C M2
286 or r15 = r27, r26 C M3
287 FSH r24 = r18, cnt C I0
288 }{.mmi; ld8 r18 = [up], UPD C M1
C One lfetch per pass through r11, which then advances by PUPD bytes.
289 lfetch [r11], PUPD
290 BSH r25 = r19, tnc C I1
291 ;; }
292 .LL11:
293 {.mmi; st8 [rp] = r15, UPD
294 or r14 = r21, r20
295 FSH r26 = r19, cnt
296 }{.mmi; ld8 r19 = [up], UPD
297 nop.m 0
298 BSH r27 = r16, tnc
299 ;; }
300 .LL10:
301 {.mmi; st8 [rp] = r14, UPD
302 or r15 = r23, r22
303 FSH r20 = r16, cnt
304 }{.mmi; ld8 r16 = [up], UPD
305 nop.m 0
306 BSH r21 = r17, tnc
307 ;; }
308 .LL01:
309 {.mmi; st8 [rp] = r15, UPD
310 or r14 = r25, r24
311 FSH r22 = r17, cnt
312 }{.mib; ld8 r17 = [up], UPD
313 BSH r23 = r18, tnc
C Counted branch on ar.lc back to .Ltop; falls into .Lbot when not taken.
314 br.cloop.dptk .Ltop
315 ;; }
316 C *** MAIN LOOP END ***
C Drain code: the same store, OR and shift pattern as the loop but with no
C further loads.  Execution falls through from each label to the next, so
C entering at .LrK performs K stores before the return and entering at
C .Lbot performs eight.
318 .Lbot:
319 {.mmi; st8 [rp] = r14, UPD
320 or r15 = r27, r26
321 FSH r24 = r18, cnt
322 }{.mib; nop 0
323 BSH r25 = r19, tnc
324 nop 0
325 ;; }
326 .Lr7:
327 {.mmi; st8 [rp] = r15, UPD
328 or r14 = r21, r20
329 FSH r26 = r19, cnt
330 }{.mib; nop 0
331 BSH r27 = r16, tnc
332 nop 0
333 ;; }
334 .Lr6:
335 {.mmi; st8 [rp] = r14, UPD
336 or r15 = r23, r22
337 FSH r20 = r16, cnt
338 }{.mib; nop 0
339 BSH r21 = r17, tnc
340 nop 0
341 ;; }
C NOTE review: from here on the embedded line numbers skip values between
C instruction groups, so lines of the original, possibly stop lines, may be
C missing from this copy - confirm against upstream.
342 .Lr5: st8 [rp] = r15, UPD
343 or r14 = r25, r24
344 FSH r22 = r17, cnt
346 .Lr4: st8 [rp] = r14, UPD
347 or r15 = r27, r26
349 .Lr3: st8 [rp] = r15, UPD
350 or r14 = r21, r20
352 .Lr2: st8 [rp] = r14, UPD
C The final store writes r22, an FSH result that is not ORed with a BSH
C piece.  ar.lc is then restored from r2 and the function returns through
C b0 with the return value in r8.
354 .Lr1: st8 [rp] = r22, UPD C M23
355 mov ar.lc = r2 C I0
356 br.ret.sptk.many b0 C B
357 EPILOGUE(func)
358 ASM_END()