1 dnl IA-64 mpn_lshift/mpn_rshift.
3 dnl Contributed to the GNU project by Torbjorn Granlund.
5 dnl Copyright 2000-2005 Free Software Foundation, Inc.
7 dnl This file is part of the GNU MP Library.
9 dnl The GNU MP Library is free software; you can redistribute it and/or modify
10 dnl it under the terms of either:
12 dnl * the GNU Lesser General Public License as published by the Free
13 dnl Software Foundation; either version 3 of the License, or (at your
14 dnl option) any later version.
18 dnl * the GNU General Public License as published by the Free Software
19 dnl Foundation; either version 2 of the License, or (at your option) any
22 dnl or both in parallel, as here.
24 dnl The GNU MP Library is distributed in the hope that it will be useful, but
25 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
26 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
29 dnl You should have received copies of the GNU General Public License and the
30 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
31 dnl see https://www.gnu.org/licenses/.
33 include(`..
/config.m4
')
39 C This code is scheduled deeply since the plain shift instructions shr and shl
40 C have a latency of 4 (on Itanium) or 3 (on Itanium 2). Poor scheduling of
41 C these instructions causes a 10 cycle replay trap on Itanium.
43 C The ld8 scheduling should probably be decreased to make the function smaller.
44 C Good lfetch will make sure we never stall anyway.
46 C We should actually issue the first ld8 at cycle 0, and the first BSH/FSH pair
47 C at cycle 2. Judicious use of predicates could allow us to issue more ld8's
59 ifdef
(`OPERATION_lshift
',`
65 define(`func',`mpn_lshift
')
67 ifdef
(`OPERATION_rshift
',`
73 define(`func',`mpn_rshift
')
76 MULFUNC_PROLOGUE
(mpn_lshift mpn_rshift
)
84 ` addp4 rp = 0, rp C M I
85 addp4 up = 0, up C M I
93 {.mmi; cmp.lt p14, p15 = 4, n C M I
96 }{.mmi; add r15 = -1, n C M I
97 sub tnc = 64, cnt C M I
100 }{.mmi; cmp.eq p6, p0 = 1, r14 C M I
101 cmp.eq p7, p0 = 2, r14 C M I
102 shr.u n = r16, 2 C I0
103 }{.mmi; cmp.eq p8, p0 = 3, r14 C M I
104 ifdef(`OPERATION_lshift',
105 ` shladd up = r15, 3, up C M I
106 shladd rp = r15, 3, rp') C M I
108 }{.mmi; add r11 = POFF, up C M I
109 ld8 r10 = [up], UPD C M01
117 .
Lb00: ld8 r19
= [up
], UPD
122 BSH r8
= r10
, tnc C function return value
126 (p14
) br.cond.dptk .grt4
139 .
grt4: ld8 r18
= [up
], UPD
157 (p15
) BSH r8
= r10
, tnc C function return value I
158 (p15
) FSH r22
= r10
, cnt C I
159 (p15
) br.cond.dptk .Lr1 C return B
161 .
grt1: ld8 r18
= [up
], UPD
164 BSH r8
= r10
, tnc C function return value
183 .
grt5: ld8 r18
= [up
], UPD
196 .
Lb10: ld8 r17
= [up
], UPD
197 (p14
) br.cond.dptk .grt2
199 BSH r8
= r10
, tnc C function return value
208 .
grt2: ld8 r18
= [up
], UPD
209 BSH r8
= r10
, tnc C function return value
219 {.mmi; ld8 r17 = [up], UPD
231 .
grt6: ld8 r18
= [up
], UPD
239 .
Lb11: ld8 r16
= [up
], UPD
242 BSH r8
= r10
, tnc C function return value
243 (p14
) br.cond.dptk .grt3
256 .
grt3: ld8 r18
= [up
], UPD
276 .
grt7: or r15
= r27
, r26
282 C *** MAIN LOOP START ***
285 {.mmi; st8 [rp] = r14, UPD C M2
286 or r15 = r27, r26 C M3
287 FSH r24 = r18, cnt C I0
288 }{.mmi; ld8 r18 = [up], UPD C M1
290 BSH r25 = r19, tnc C I1
293 {.mmi; st8 [rp] = r15, UPD
296 }{.mmi; ld8 r19 = [up], UPD
301 {.mmi; st8 [rp] = r14, UPD
304 }{.mmi; ld8 r16 = [up], UPD
309 {.mmi; st8 [rp] = r15, UPD
312 }{.mib; ld8 r17 = [up], UPD
316 C *** MAIN LOOP END ***
319 {.mmi; st8 [rp] = r14, UPD
327 {.mmi; st8 [rp] = r15, UPD
335 {.mmi; st8 [rp] = r14, UPD
342 .
Lr5: st8
[rp
] = r15
, UPD
346 .
Lr4: st8
[rp
] = r14
, UPD
349 .
Lr3: st8
[rp
] = r15
, UPD
352 .
Lr2: st8
[rp
] = r14
, UPD
354 .
Lr1: st8
[rp
] = r22
, UPD C M23
356 br.
ret.sptk.many b0 C B