beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / sparc32 / v9 / submul_1.asm
blob92d0ce7db9ea1086d3db53a092a7f7f70a6447c3
1 dnl SPARC v9 32-bit mpn_submul_1 -- Multiply a limb vector with a limb and
2 dnl subtract the result from a second limb vector.
4 dnl Copyright 1998, 2000, 2001, 2003 Free Software Foundation, Inc.
6 dnl This file is part of the GNU MP Library.
7 dnl
8 dnl The GNU MP Library is free software; you can redistribute it and/or modify
9 dnl it under the terms of either:
10 dnl
11 dnl * the GNU Lesser General Public License as published by the Free
12 dnl Software Foundation; either version 3 of the License, or (at your
13 dnl option) any later version.
14 dnl
15 dnl or
16 dnl
17 dnl * the GNU General Public License as published by the Free Software
18 dnl Foundation; either version 2 of the License, or (at your option) any
19 dnl later version.
20 dnl
21 dnl or both in parallel, as here.
22 dnl
23 dnl The GNU MP Library is distributed in the hope that it will be useful, but
24 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
25 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
26 dnl for more details.
27 dnl
28 dnl You should have received copies of the GNU General Public License and the
29 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
30 dnl see https://www.gnu.org/licenses/.
32 include(`../config.m4')
34 C Algorithm: We use two floating-point multiplies per limb product, with the
35 C invariant v operand split into two 16-bit pieces, and the u operand split
36 C into 32-bit pieces. We convert the two 48-bit products and transfer them to
37 C the integer unit.
39 C cycles/limb
40 C UltraSPARC 1&2: 6.5
41 C UltraSPARC 3: ?
43 C Possible optimizations:
44 C 1. Combine 32-bit memory operations into 64-bit operations. Since we're
45 C memory bandwidth limited, this could save 1.5 cycles/limb.
46 C 2. Unroll the inner loop. Since we already use alternate temporary areas,
47 C it is very straightforward to unroll, using an exit branch midway.
48 C Unrolling would allow deeper scheduling, which could improve speed for the
49 C L2 cache case.
50 C 3. For mpn_mul_1: Use more alternating temp areas. The std and ldx
51 C instructions aren't scheduled sufficiently far apart with just two temp areas.
52 C 4. Specialize for particular v values. If its upper 16 bits are zero, we
53 C could save many operations.
55 C INPUT PARAMETERS
56 C rp o0
57 C up o1
58 C n o2
59 C v o3
61 define(`FSIZE',224) C bytes of stack reserved for scratch storage
63 ASM_START()
C mpn_submul_1 -- subtract up[0..n-1] * v from rp[0..n-1], 32 bits per limb,
C and return the borrow out of the top limb in %o0.
C
C Register use, as read from the code below:
C   %o0  rp (biased on the short paths so the wind-down stores hit rp[0]...)
C   %o1  up
C   %o2  n, decremented once per pipeline stage entered
C   %o3  v
C   %o5  pointer into a 32-byte stack scratch area, toggled by 16 per limb
C   %g1  p0  = up[i] * (v & 0xffff) as a 64-bit integer
C   %g2  p16 = up[i] * (v >> 16) as a 64-bit integer
C   %g3  cy, the borrow carried from limb to limb
C   %g4  temporary: p, then rp[i] - p
C   %g5  rp[i]
C   %f6  (double) (v & 0xffff)      %f8  (double) (v >> 16)
C   %f10:%f11  up[i] zero-extended to 64 bits      %f2  the same as a double
C   %f16/%f14  high-half product (double / integer)
C   %f4/%f12   low-half product (double / integer)
C No register window is allocated (there is no save), so the arguments are
C read directly from %o0-%o3.
C n = 0 is not handled: the first subcc would wrap, so n >= 1 is assumed.
64 PROLOGUE(mpn_submul_1)
65 add %sp, -FSIZE, %sp C reserve FSIZE bytes of stack
66 sethi %hi(0xffff), %g1
67 srl %o3, 16, %g2 C g2 = v >> 16
68 or %g1, %lo(0xffff), %g1 C g1 = 0xffff
69 and %o3, %g1, %g1 C g1 = v & 0xffff
70 stx %g1, [%sp+104] C move both halves of v to the FPU via memory
71 stx %g2, [%sp+112]
72 ldd [%sp+104], %f6
73 ldd [%sp+112], %f8
74 fxtod %f6, %f6 C f6 = (double) (v & 0xffff)
75 fxtod %f8, %f8 C f8 = (double) (v >> 16)
C The word at %sp+104 is the first word of the 64-bit slot holding v & 0xffff;
C with SPARC's big-endian layout that is the high word, which is zero.
76 ld [%sp+104], %f10 C zero f10
78 mov 0, %g3 C cy = 0
C %f18 is never written and %f0 is never read in this routine, so the result
C of fanop is discarded.
80 define(`fanop', `fitod %f18, %f0') C A quasi nop running in the FA pipe
82 add %sp, 160, %o5 C point in scratch area
83 and %o5, -32, %o5 C align at 0 (mod 32) in scratch area
C Pipeline fill. Each stage below starts one more limb; when n runs out the
C code finishes the products in flight and jumps into the wind-down chain
C .L5 -> .L1 at the point matching the number of limbs still pending.
85 subcc %o2, 1, %o2 C n--, sets the flags tested by bne below
86 ld [%o1], %f11 C read up[i]
87 add %o1, 4, %o1 C up++
88 bne,pt %icc, .L_two_or_more
89 fxtod %f10, %f2 C delay slot (always executed): f2 = (double) up[i]
C n == 1: compute the only product and finish through .L1.
91 fmuld %f2, %f8, %f16 C f16 = up[i] * (v >> 16)
92 fmuld %f2, %f6, %f4 C f4 = up[i] * (v & 0xffff)
93 fdtox %f16, %f14 C convert both products to 64-bit integers
94 fdtox %f4, %f12
95 std %f14, [%o5+16] C move them to the integer unit via scratch
96 std %f12, [%o5+24]
97 ldx [%o5+16], %g2 C p16
98 ldx [%o5+24], %g1 C p0
99 lduw [%o0], %g5 C read rp[i]
100 b .L1
101 add %o0, -16, %o0 C delay slot: bias rp so [%o0+16] at .L1 is rp[0]
103 .align 16
104 .L_two_or_more:
105 subcc %o2, 1, %o2
106 ld [%o1], %f11 C read up[i]
107 fmuld %f2, %f8, %f16
108 fmuld %f2, %f6, %f4
109 add %o1, 4, %o1 C up++
110 bne,pt %icc, .L_three_or_more
111 fxtod %f10, %f2 C delay slot: f2 = (double) up[i]
C n == 2: limb 0 goes to scratch slots +16/+24, limb 1 to +0/+8.
113 fdtox %f16, %f14
114 fdtox %f4, %f12
115 std %f14, [%o5+16]
116 fmuld %f2, %f8, %f16
117 std %f12, [%o5+24]
118 fmuld %f2, %f6, %f4
119 fdtox %f16, %f14
120 fdtox %f4, %f12
121 std %f14, [%o5+0]
122 std %f12, [%o5+8]
123 lduw [%o0], %g5 C read rp[i]
124 ldx [%o5+16], %g2 C p16
125 ldx [%o5+24], %g1 C p0
126 b .L2
127 add %o0, -12, %o0 C delay slot: bias rp so [%o0+12] at .L2 is rp[0]
129 .align 16
130 .L_three_or_more:
131 subcc %o2, 1, %o2
132 ld [%o1], %f11 C read up[i]
133 fdtox %f16, %f14
134 fdtox %f4, %f12
135 std %f14, [%o5+16]
136 fmuld %f2, %f8, %f16
137 std %f12, [%o5+24]
138 fmuld %f2, %f6, %f4
139 add %o1, 4, %o1 C up++
140 bne,pt %icc, .L_four_or_more
141 fxtod %f10, %f2 C delay slot: f2 = (double) up[i]
C n == 3: finish the products in flight, then wind down from .L3.
143 fdtox %f16, %f14
144 fdtox %f4, %f12
145 std %f14, [%o5+0]
146 fmuld %f2, %f8, %f16
147 std %f12, [%o5+8]
148 fmuld %f2, %f6, %f4
149 fdtox %f16, %f14
150 ldx [%o5+16], %g2 C p16
151 fdtox %f4, %f12
152 ldx [%o5+24], %g1 C p0
153 std %f14, [%o5+16] C slots +16/+24 are reused once limb 0 has been loaded
154 std %f12, [%o5+24]
155 lduw [%o0], %g5 C read rp[i]
156 b .L3
157 add %o0, -8, %o0 C delay slot: bias rp so [%o0+8] at .L3 is rp[0]
159 .align 16
160 .L_four_or_more:
161 subcc %o2, 1, %o2
162 ld [%o1], %f11 C read up[i]
163 fdtox %f16, %f14
164 fdtox %f4, %f12
165 std %f14, [%o5+0]
166 fmuld %f2, %f8, %f16
167 std %f12, [%o5+8]
168 fmuld %f2, %f6, %f4
169 add %o1, 4, %o1 C up++
170 bne,pt %icc, .L_five_or_more
171 fxtod %f10, %f2 C delay slot: f2 = (double) up[i]
C n == 4: finish the products in flight, then wind down from .L4.
173 fdtox %f16, %f14
174 ldx [%o5+16], %g2 C p16
175 fdtox %f4, %f12
176 ldx [%o5+24], %g1 C p0
177 std %f14, [%o5+16]
178 fmuld %f2, %f8, %f16
179 std %f12, [%o5+24]
180 fmuld %f2, %f6, %f4
181 add %o1, 4, %o1 C up++ (up is not read again on this path)
182 lduw [%o0], %g5 C read rp[i]
183 b .L4
184 add %o0, -4, %o0 C delay slot: bias rp so [%o0+4] at .L4 is rp[0]
186 .align 16
187 .L_five_or_more:
188 subcc %o2, 1, %o2
189 ld [%o1], %f11 C read up[i]
190 fdtox %f16, %f14
191 ldx [%o5+16], %g2 C p16
192 fdtox %f4, %f12
193 ldx [%o5+24], %g1 C p0
194 std %f14, [%o5+16]
195 fmuld %f2, %f8, %f16
196 std %f12, [%o5+24]
197 fmuld %f2, %f6, %f4
198 add %o1, 4, %o1 C up++
199 lduw [%o0], %g5 C read rp[i]
200 bne,pt %icc, .Loop C n >= 6: enter the steady-state loop
201 fxtod %f10, %f2 C delay slot: f2 = (double) up[i]
202 b,a .L5 C n == 5: wind down; ,a annuls the delay slot
204 C BEGIN MAIN LOOP
C Each iteration retires one limb: it combines the p16/p0 already in %g2/%g1
C with cy and rp[i], stores the result, and advances the younger limbs one
C pipeline stage each.
C The `C -- k' markers presumably number the intended instruction issue groups
C (TODO confirm against the UltraSPARC scheduling rules).
205 .align 16
206 C -- 0
207 .Loop: sub %g0, %g3, %g3 C negate the high word of the previous difference
208 subcc %o2, 1, %o2 C n--, tested by the bne at the loop bottom
209 ld [%o1], %f11 C read up[i]
210 fdtox %f16, %f14
211 C -- 1
212 sllx %g2, 16, %g4 C (p16 << 16)
213 add %o0, 4, %o0 C rp++
214 ldx [%o5+0], %g2 C p16
215 fdtox %f4, %f12
216 C -- 2
217 srl %g3, 0, %g3 C zero most significant 32 bits
218 add %g1, %g4, %g4 C p = p0 + (p16 << 16)
219 ldx [%o5+8], %g1 C p0
220 fanop
221 C -- 3
223 add %g3, %g4, %g4 C p += cy
224 std %f14, [%o5+0]
225 fmuld %f2, %f8, %f16
226 C -- 4
228 sub %g5, %g4, %g4 C p = rp[i] - p
229 std %f12, [%o5+8]
230 fmuld %f2, %f6, %f4
231 C -- 5
232 xor %o5, 16, %o5 C alternate scratch variables
233 add %o1, 4, %o1 C up++
234 stw %g4, [%o0-4] C store the low 32 bits (rp was already advanced)
235 fanop
236 C -- 6
237 srlx %g4, 32, %g3 C new cy
238 lduw [%o0], %g5 C read rp[i]
239 bne,pt %icc, .Loop
240 fxtod %f10, %f2 C delay slot: f2 = (double) up[i]
241 C END MAIN LOOP
C Wind-down: .L5 to .L1 retire the last five limbs with progressively less
C work still in flight. Each "sub %g0, %g3, %g3" sits just before the next
C label, so the short-path entries, which arrive with cy = 0, skip it.
243 .L5: sub %g0, %g3, %g3 C negate the high word of the previous difference
244 fdtox %f16, %f14
245 sllx %g2, 16, %g4 C (p16 << 16)
246 ldx [%o5+0], %g2 C p16
247 fdtox %f4, %f12
248 srl %g3, 0, %g3 C zero most significant 32 bits
249 add %g1, %g4, %g4 C p = p0 + (p16 << 16)
250 ldx [%o5+8], %g1 C p0
251 add %g4, %g3, %g4 C p += cy
252 std %f14, [%o5+0]
253 fmuld %f2, %f8, %f16
254 sub %g5, %g4, %g4 C p = rp[i] - p
255 std %f12, [%o5+8]
256 fmuld %f2, %f6, %f4
257 xor %o5, 16, %o5 C alternate scratch variables
258 stw %g4, [%o0+0]
259 srlx %g4, 32, %g3 C new cy
260 lduw [%o0+4], %g5 C read rp[i]
262 sub %g0, %g3, %g3 C negate the high word of the previous difference
263 .L4: fdtox %f16, %f14
264 sllx %g2, 16, %g4 C (p16 << 16)
265 ldx [%o5+0], %g2 C p16
266 fdtox %f4, %f12
267 srl %g3, 0, %g3 C zero most significant 32 bits
268 add %g1, %g4, %g4 C p = p0 + (p16 << 16)
269 ldx [%o5+8], %g1 C p0
270 add %g3, %g4, %g4 C p += cy
271 std %f14, [%o5+0]
272 sub %g5, %g4, %g4 C p = rp[i] - p
273 std %f12, [%o5+8]
274 xor %o5, 16, %o5 C alternate scratch variables
275 stw %g4, [%o0+4]
276 srlx %g4, 32, %g3 C new cy
277 lduw [%o0+8], %g5 C read rp[i]
279 sub %g0, %g3, %g3 C negate the high word of the previous difference
280 .L3: sllx %g2, 16, %g4 C (p16 << 16)
281 ldx [%o5+0], %g2 C p16
282 srl %g3, 0, %g3 C zero most significant 32 bits
283 add %g1, %g4, %g4 C p = p0 + (p16 << 16)
284 ldx [%o5+8], %g1 C p0
285 add %g3, %g4, %g4 C p += cy
286 sub %g5, %g4, %g4 C p = rp[i] - p
287 xor %o5, 16, %o5 C alternate scratch variables
288 stw %g4, [%o0+8]
289 srlx %g4, 32, %g3 C new cy
290 lduw [%o0+12], %g5 C read rp[i]
292 sub %g0, %g3, %g3 C negate the high word of the previous difference
293 .L2: sllx %g2, 16, %g4 C (p16 << 16)
294 ldx [%o5+0], %g2 C p16
295 srl %g3, 0, %g3 C zero most significant 32 bits
296 add %g1, %g4, %g4 C p = p0 + (p16 << 16)
297 ldx [%o5+8], %g1 C p0
298 add %g3, %g4, %g4 C p += cy
299 sub %g5, %g4, %g4 C p = rp[i] - p
300 stw %g4, [%o0+12]
301 srlx %g4, 32, %g3 C new cy
302 lduw [%o0+16], %g5 C read rp[i]
304 sub %g0, %g3, %g3 C negate the high word of the previous difference
305 .L1: sllx %g2, 16, %g4 C (p16 << 16)
306 srl %g3, 0, %g3 C zero most significant 32 bits
307 add %g1, %g4, %g4 C p = p0 + (p16 << 16)
308 add %g3, %g4, %g4 C p += cy
309 sub %g5, %g4, %g4 C p = rp[i] - p
310 stw %g4, [%o0+16] C store the last result limb
311 srlx %g4, 32, %g3 C new cy
313 sub %g0, %g3, %o0 C return value: negated high word, i.e. the borrow out
314 retl
315 sub %sp, -FSIZE, %sp C delay slot: release the FSIZE bytes of stack
316 EPILOGUE(mpn_submul_1)