beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / ia64 / sqr_diag_addlsh1.asm
blob727f489b12776ba1cf6a60f73ceeba1160f28cd8
1 dnl IA-64 mpn_sqr_diag_addlsh1
3 dnl Contributed to the GNU project by Torbjorn Granlund.
5 dnl Copyright 2010, 2011 Free Software Foundation, Inc.
7 dnl This file is part of the GNU MP Library.
8 dnl
9 dnl The GNU MP Library is free software; you can redistribute it and/or modify
10 dnl it under the terms of either:
11 dnl
12 dnl * the GNU Lesser General Public License as published by the Free
13 dnl Software Foundation; either version 3 of the License, or (at your
14 dnl option) any later version.
15 dnl
16 dnl or
17 dnl
18 dnl * the GNU General Public License as published by the Free Software
19 dnl Foundation; either version 2 of the License, or (at your option) any
20 dnl later version.
21 dnl
22 dnl or both in parallel, as here.
23 dnl
24 dnl The GNU MP Library is distributed in the hope that it will be useful, but
25 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
26 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
27 dnl for more details.
28 dnl
29 dnl You should have received copies of the GNU General Public License and the
30 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
31 dnl see https://www.gnu.org/licenses/.
33 include(`../config.m4')
35 C cycles/limb
36 C Itanium: ?
37 C Itanium 2: 2 Unrolling could bring it to 1.5 + epsilon
39 C Exact performance table. The 2nd line is this code, the 3rd line is ctop-
40 C less code. In an assembly sqr_basecase, the ctop-full numbers will become a
41 C few cycles better since we can mitigate the many I0 instructions.
43 C 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
44 C - 20 22 24 26 28 30 32 34 36 38 40 42 44 46 48 50 52 54 56 Needs updating
45 C - 13 16 17 18 20 21 23 25 26 30 31 31 33 34 36 38 39 42 43
47 C We should keep in mind that this code takes linear time in an O(n^2) context
48 C and that it will only be used under SQR_TOOM2_THRESHOLD, which might become
49 C around 60. Keeping overhead down for smallish operands (< 10) is more
50 C important than optimal cycle counts.
52 C TODO
53 C * Make sure we don't depend on uninitialised r-registers, f-registers, or
54 C   p-registers.
55 C * Optimise by doing first two loop iterations in function header.
57 C INPUT PARAMETERS
C Register aliases for the four arguments.  Each pointer has two names: the
C *_param register it is read from on entry, and the working register the
C entry code copies it into.  The copy is needed because the entry code then
C overwrites r32, r33 and r36 (two ld8 destinations and a zero), and those
C registers are renamed by the loop branch.  n is only read in the entry code,
C directly from r35.  The sizes are counted in 8-byte elements (all accesses
C are ld8/ldf8/st8).
58 define(`rp_param', `r32') define(`rp', `r14') C size: 2n
59 define(`tp_param', `r33') define(`tp', `r15') C size: 2n - 2
60 define(`up_param', `r34') define(`up', `r31') C size: n
61 define(`n', `r35')
C ABI64(x) expands to x when HAVE_ABI_32 is not defined and to nothing when it
C is; ABI32(x) is the opposite.  The entry code lists both variants of each
C ABI-dependent instruction (mov versus addp4 for pointers, nop versus zxt4
C for n) and exactly one of them survives macro expansion.
63 ifdef(`HAVE_ABI_32',`
64 define(`ABI64', `')
65 define(`ABI32', `$1')
66 ',`
67 define(`ABI64', `$1')
68 define(`ABI32', `')
69 ')
C mpn_sqr_diag_addlsh1 (rp, tp, up, n)
C
C Software-pipelined loop driven by ar.lc/ar.ec, rotating registers and the
C rotating predicates p16 and up.  For each limb loaded from up the loop forms
C the 128-bit square with xma.l/xma.hu.  The limbs loaded from tp are shifted
C left by one bit across limb boundaries by the two shrp-by-63 instructions;
C one shifted limb enters through the xma addend (f40, set from r16), the
C other is added to the high product half with an integer add, and the carry
C is propagated with the cmpltu/cmpequc/cmpeqor predicates.  Two limbs are
C stored through rp per iteration.  The exact limb alignment of the tp operand
C against the squares is not re-derived here.
C
C State saved on entry and restored before the return:
C   ar.pfs -> r2,  ar.lc -> r3,  ar.ec -> r9,  pr -> r10
C
C The loop count is n - 2 and two tp limbs are loaded unconditionally, so the
C code presumably requires n >= 2 -- TODO confirm against the callers.
C
C Every ";;" line is an instruction-group stop.  They are required wherever a
C register written in one bundle is read, or post-incremented again, in a
C following one.
71 ASM_START()
72 PROLOGUE(mpn_sqr_diag_addlsh1)
C Unwind annotations: ar.pfs is kept in r2 and ar.lc in r3.
74 .prologue
75 .save ar.pfs, r2
76 .save ar.lc, r3
77 .body
C Entry: allocate the frame, save ar.lc, and copy the pointer arguments into
C rp/tp/up before their incoming registers are reused below.
79 {.mii; alloc r2 = ar.pfs, 4,24,0,24 C M
80 mov r3 = ar.lc C I0
81 ABI64(` nop 4711 ')
82 ABI32(` zxt4 n = n ')
83 }{.mmi; ABI64(` mov tp = tp_param ') C M I
84 ABI32(` addp4 tp = 0, tp_param') C M I
85 ABI64(` mov up = up_param ') C M I
86 ABI32(` addp4 up = 0, up_param') C M I
87 ABI64(` mov rp = rp_param ') C M I
88 ABI32(` addp4 rp = 0, rp_param') C M I
89 ;; C stop: tp written above is the ld8 address below
C Preload the first two tp limbs, save ar.ec, and compute the loop count n - 2.
90 }{.mmi; ld8 r36 = [tp], 8 C M
91 add r20 = -2, n C M I
92 mov r9 = ar.ec C I0
93 ;; C stop: tp was post-incremented by the ld8 above
94 }{.mmi; ld8 r32 = [tp], 8 C M
95 mov r16 = 0 C M I
96 mov ar.ec = 7 C I0
97 ;;
98 }{.mmi; nop 4711
99 mov r44 = 0 C M I
100 mov ar.lc = r20 C I0
101 ;;
C Save the predicates, then start the pipeline with p16 and p17 set (bits 16
C and 17 of the pr.rot immediate) and the remaining rotating predicates clear.
102 }{.mii; mov r33 = 0
103 mov r10 = pr C I0
104 mov pr.rot = 0x30000 C I0
105 ;; C stop: pr.rot, ar.lc and ar.ec must be set before the loop branch
106 } br.cexit.spnt.few.clr L(end)
108 dnl *** MAIN LOOP START ***
109 ALIGN(32)
110 L(top):
111 {.mfi; (p18) ldf8 f33 = [up], 8 C M
112 (p20) xma.l f36 = f35, f35, f42 C F
113 (p41) cmpequc p50, p0 = -1, r44 C M I
114 }{.mfi; setfsig f40 = r16 C M23
115 (p20) xma.hu f38 = f35, f35, f42 C F
116 (p23) add r50 = r41, r49 C M I
117 ;; C stop: r50 written above is compared below
118 }{.mmi; (p16) ld8 r36 = [tp], 8 C M
119 (p23) cmpltu p40, p0 = r50, r41 C cyout hi M I
120 (p19) shrp r45 = r38, r35, 63 C non-critical I0
121 }{.mmi; (p21) getfsig r39 = f39 C hi M2
122 (p24) st8 [rp] = r51, 8 C hi M23
123 (p41) add r44 = 1, r44 C M I
124 ;; C stop: tp, rp and r44 updated above are used again below
125 }{.mmi; (p16) ld8 r32 = [tp], 8 C M
126 (p50) cmpeqor p40, p0 = -1, r50 C cyout hi M I
127 (p17) shrp r16 = r33, r37, 63 C critical I0
128 }{.mmi; (p21) getfsig r42 = f37 C lo M2
129 (p23) st8 [rp] = r44, 8 C lo M23
130 (p50) add r50 = 1, r50 C M I
131 ;; C stop: close the group before the loop branch
132 } br.ctop.sptk.few.clr L(top) C B
133 dnl *** MAIN LOOP END ***
C Wind-down: apply the pending carries, store the last three limbs, and
C restore ar.lc, ar.ec, pr and ar.pfs before returning.
135 L(end):
136 {.mmi; nop 4711
137 (p41) add r44 = 1, r44 C M I
138 shr.u r48 = r39, 63 C I0
139 ;; C stop: r44 and r48 written above are read below
140 }{.mmi; st8 [rp] = r51, 8 C M23
141 (p41) cmpequc p6, p0 = 0, r44 C M I
142 add r50 = r41, r48 C M I
143 ;; C stop: rp, p6 and r50 written above are used below
144 }{.mmi; st8 [rp] = r44, 8 C M23
145 (p6) add r50 = 1, r50 C M I
146 mov ar.lc = r3 C I0
147 ;; C stop: rp and r50 updated above are used below
148 }{.mii; st8 [rp] = r50 C M23
149 mov ar.ec = r9 C I0
150 mov pr = r10 C I0
151 ;;
152 }{.mib; nop 4711
153 mov ar.pfs = r2 C I0
154 br.ret.sptk.many b0 C B
155 }
156 EPILOGUE()