beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / pa64 / sqr_diagonal.asm
blobf6fadc93c63a095db5fb465cce8be209168414d6
1 dnl HP-PA 2.0 64-bit mpn_sqr_diagonal.
3 dnl Copyright 2001-2003 Free Software Foundation, Inc.
5 dnl This file is part of the GNU MP Library.
6 dnl
7 dnl The GNU MP Library is free software; you can redistribute it and/or modify
8 dnl it under the terms of either:
9 dnl
10 dnl * the GNU Lesser General Public License as published by the Free
11 dnl Software Foundation; either version 3 of the License, or (at your
12 dnl option) any later version.
13 dnl
14 dnl or
15 dnl
16 dnl * the GNU General Public License as published by the Free Software
17 dnl Foundation; either version 2 of the License, or (at your option) any
18 dnl later version.
19 dnl
20 dnl or both in parallel, as here.
21 dnl
22 dnl The GNU MP Library is distributed in the hope that it will be useful, but
23 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
25 dnl for more details.
26 dnl
27 dnl You should have received copies of the GNU General Public License and the
28 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
29 dnl see https://www.gnu.org/licenses/.
32 dnl This code runs at 7.25 cycles/limb on PA8000 and 7.75 cycles/limb on
33 dnl PA8500. The cache would saturate at 5 cycles/limb, so there is some room
34 dnl for optimization.
36 include(`../config.m4')
38 C INPUT PARAMETERS
39 define(`rp',`%r26') C result pointer; two doublewords are stored per input limb
40 define(`up',`%r25') C source pointer; limbs are loaded with post-increment
41 define(`n',`%r24') C count of limbs still to process, decremented by addib
C Integer working registers used when folding the cross product into the result.
43 define(`p00',`%r28') C low result doubleword, reloaded from -16(rp)
44 define(`p32',`%r29') C cross product of the two 32-bit halves, reloaded from scratch
45 define(`p64',`%r31') C high result doubleword, reloaded from -8(rp)
46 define(`t0',`%r19') C part of the shifted cross product that is added to p00
47 define(`t1',`%r20') C part of the shifted cross product that is added to p64
C Emit the assembler .level directive for the configured ABI: 2.0w when
C HAVE_ABI_2_0w is defined and 2.0 otherwise.
49 ifdef(`HAVE_ABI_2_0w',
50 ` .level 2.0w
51 ',` .level 2.0
52 ')
C mpn_sqr_diagonal
C
C For each limb loaded from up, store its two-limb square at rp: the low
C doubleword at offset 0 and the high doubleword at offset 8. rp advances by
C 16 bytes per limb and n counts the limbs still to process.
C
C xmpyu is applied to the 32-bit halves of each limb. Writing hi for the
C left half (%frXl) and lo for the right half (%frXr), the square is built as
C     lo*lo  +  2*hi*lo*2^32  +  hi*hi*2^64
C   - lo*lo and hi*hi are stored straight from fp registers to 0(rp) and 8(rp)
C   - hi*lo is spilled to a scratch doubleword at -120(%r30) or -128(%r30),
C     reloaded into p32, and its doubled and 32-bit-shifted value is added
C     into the two stored doublewords with carry propagation.
C
C The code is software pipelined: while the products of one limb are being
C stored, the cross product of the previous limb is folded into the result
C pair at -16(rp) and -8(rp). The two scratch slots and the register pairs
C %fr4 and %fr8 alternate between consecutive limbs, so the loop body is
C unrolled twice.
C
C Registers written: rp, up, n, p00, p32, p64, t0, t1, %fr4 to %fr11, and
C %r30 (raised by 128 on entry, lowered by 128 in each return delay slot).
C
C NOTE(review): n = 0 is not special-cased; the first addib branches only
C when the decremented count is exactly zero, so callers presumably always
C pass n >= 1 -- confirm against the callers.
53 PROLOGUE(mpn_sqr_diagonal)
54 ldo 128(%r30),%r30 C %r30 += 128; scratch slots are at -120 and -128 from here
C Load the first limb; if it is the only one, finish in end1.
56 fldds,ma 8(up),%fr8
57 addib,= -1,n,L(end1)
58 nop
C At least two limbs: preload the second limb into %fr4 and issue the three
C products of the first limb (cross product to the -120 scratch slot).
59 fldds,ma 8(up),%fr4
60 xmpyu %fr8l,%fr8r,%fr10
61 fstd %fr10,-120(%r30)
62 xmpyu %fr8r,%fr8r,%fr9
63 fstd %fr9,0(rp)
64 xmpyu %fr8l,%fr8l,%fr11
65 fstd %fr11,8(rp)
C Exactly two limbs: go to end2. The ldo below sits in the branch delay slot
C and advances rp on both paths.
66 addib,= -1,n,L(end2)
67 ldo 16(rp),rp
C First half of the unrolled loop: products of the limb in %fr4 (cross product
C to the -128 slot), then fold the previous cross product from the -120 slot.
69 LDEF(loop)
70 fldds,ma 8(up),%fr8 C load next up limb
71 xmpyu %fr4l,%fr4r,%fr6
72 fstd %fr6,-128(%r30)
73 xmpyu %fr4r,%fr4r,%fr5 C multiply in fp regs
74 fstd %fr5,0(rp)
75 xmpyu %fr4l,%fr4l,%fr7
76 fstd %fr7,8(rp)
77 ldd -120(%r30),p32
78 ldd -16(rp),p00 C accumulate in int regs
79 ldd -8(rp),p64
C t0 = p32 moved up by 33 bits (low doubleword of 2*p32*2^32).
80 depd,z p32,30,31,t0
81 add t0,p00,p00
82 std p00,-16(rp)
C t1 = upper 33 bits of p32 (high doubleword of 2*p32*2^32); add,dc also
C takes the carry out of the add into p00 above.
83 extrd,u p32,32,33,t1
84 add,dc t1,p64,p64
85 std p64,-8(rp)
C Leave through exit when the limb just loaded into %fr8 is the last one.
86 addib,= -1,n,L(exit)
87 ldo 16(rp),rp
C Second half of the unrolled loop: same steps with the roles of %fr4 and %fr8
C and of the two scratch slots exchanged.
89 fldds,ma 8(up),%fr4
90 xmpyu %fr8l,%fr8r,%fr10
91 fstd %fr10,-120(%r30)
92 xmpyu %fr8r,%fr8r,%fr9
93 fstd %fr9,0(rp)
94 xmpyu %fr8l,%fr8l,%fr11
95 fstd %fr11,8(rp)
96 ldd -128(%r30),p32
97 ldd -16(rp),p00
98 ldd -8(rp),p64
99 depd,z p32,30,31,t0
100 add t0,p00,p00
101 std p00,-16(rp)
102 extrd,u p32,32,33,t1
103 add,dc t1,p64,p64
104 std p64,-8(rp)
C Continue while limbs remain; otherwise fall through to end2 with the last
C limb in %fr4.
105 addib,<> -1,n,L(loop)
106 ldo 16(rp),rp
C end2: the last limb is in %fr4 and the cross product of the limb before it
C is in the -120 slot. Issue the products of the last limb, then fold both
C outstanding cross products, first -120 and then -128.
108 LDEF(end2)
109 xmpyu %fr4l,%fr4r,%fr6
110 fstd %fr6,-128(%r30)
111 xmpyu %fr4r,%fr4r,%fr5
112 fstd %fr5,0(rp)
113 xmpyu %fr4l,%fr4l,%fr7
114 fstd %fr7,8(rp)
115 ldd -120(%r30),p32
116 ldd -16(rp),p00
117 ldd -8(rp),p64
118 depd,z p32,30,31,t0
119 add t0,p00,p00
120 std p00,-16(rp)
121 extrd,u p32,32,33,t1
122 add,dc t1,p64,p64
123 std p64,-8(rp)
124 ldo 16(rp),rp
125 ldd -128(%r30),p32
126 ldd -16(rp),p00
127 ldd -8(rp),p64
128 depd,z p32,30,31,t0
129 add t0,p00,p00
130 std p00,-16(rp)
131 extrd,u p32,32,33,t1
132 add,dc t1,p64,p64
133 std p64,-8(rp)
C Return through %r2; the delay slot undoes the entry adjustment of %r30.
134 bve (%r2)
135 ldo -128(%r30),%r30
C exit: the last limb is in %fr8 and the cross product of the limb before it
C is in the -128 slot. Issue the products of the last limb, then fold both
C outstanding cross products, first -128 and then -120.
C Here the cross product is split at 32 bits (t0 = low half moved up by 32,
C t1 = high half) and the t0/t1 pair is added twice, which doubles it; the
C loop instead uses a single 33-bit split.
137 LDEF(exit)
138 xmpyu %fr8l,%fr8r,%fr10
139 fstd %fr10,-120(%r30)
140 xmpyu %fr8r,%fr8r,%fr9
141 fstd %fr9,0(rp)
142 xmpyu %fr8l,%fr8l,%fr11
143 fstd %fr11,8(rp)
144 ldd -128(%r30),p32
145 ldd -16(rp),p00
146 ldd -8(rp),p64
147 depd,z p32,31,32,t0
148 add t0,p00,p00
149 extrd,u p32,31,32,t1
150 add,dc t1,p64,p64
151 add t0,p00,p00
152 add,dc t1,p64,p64
153 std p00,-16(rp)
154 std p64,-8(rp)
155 ldo 16(rp),rp
156 ldd -120(%r30),p32
157 ldd -16(rp),p00
158 ldd -8(rp),p64
159 depd,z p32,31,32,t0
160 add t0,p00,p00
161 extrd,u p32,31,32,t1
162 add,dc t1,p64,p64
163 add t0,p00,p00
164 add,dc t1,p64,p64
165 std p00,-16(rp)
166 std p64,-8(rp)
167 bve (%r2)
168 ldo -128(%r30),%r30
C end1: single-limb case. The only limb is in %fr8; issue its three products
C (cross product to the -128 slot), advance rp, and fold the cross product
C with the same 32-bit split added twice.
170 LDEF(end1)
171 xmpyu %fr8l,%fr8r,%fr10
172 fstd %fr10,-128(%r30)
173 xmpyu %fr8r,%fr8r,%fr9
174 fstd %fr9,0(rp)
175 xmpyu %fr8l,%fr8l,%fr11
176 fstd %fr11,8(rp)
177 ldo 16(rp),rp
178 ldd -128(%r30),p32
179 ldd -16(rp),p00
180 ldd -8(rp),p64
181 depd,z p32,31,32,t0
182 add t0,p00,p00
183 extrd,u p32,31,32,t1
184 add,dc t1,p64,p64
185 add t0,p00,p00
186 add,dc t1,p64,p64
187 std p00,-16(rp)
188 std p64,-8(rp)
189 bve (%r2)
190 ldo -128(%r30),%r30
191 EPILOGUE(mpn_sqr_diagonal)