dnl  SPARC v9 32-bit mpn_mul_1 -- Multiply a limb vector with a limb and store
dnl  the result in a second limb vector.

dnl  Copyright 1998, 2000, 2001, 2003 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C Algorithm: We use two floating-point multiplies per limb product, with the
C invariant v operand split into two 16-bit pieces, and the u operand split
C into 32-bit pieces.  We convert the two 48-bit products and transfer them to
C the integer unit.
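
C As a rough illustration, the same per-limb computation in C (a sketch for
C reference only; the function name is hypothetical and this is not part of
C the build):
C
C   #include <stdint.h>
C   uint32_t mul_1_model (uint32_t *rp, const uint32_t *up, long n, uint32_t v)
C   {
C     uint64_t v0 = v & 0xffff, v16 = v >> 16, cy = 0;
C     for (long i = 0; i < n; i++)
C       {
C         uint64_t p0  = up[i] * v0;            /* <= 48 bits, exact in a double */
C         uint64_t p16 = up[i] * v16;           /* <= 48 bits, exact in a double */
C         uint64_t p   = p0 + (p16 << 16) + cy; /* cannot overflow 64 bits */
C         rp[i] = (uint32_t) p;                 /* low 32 bits go to rp[] */
C         cy = p >> 32;                         /* high bits become the carry */
C       }
C     return (uint32_t) cy;
C   }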

C                   cycles/limb
C UltraSPARC 1&2:      6.5
C UltraSPARC 3:         ?

C Possible optimizations:
C   1. Combine 32-bit memory operations into 64-bit operations.  Since we're
C      memory bandwidth limited, this could save 1.5 cycles/limb.
C   2. Unroll the inner loop.  Since we already use alternate temporary areas,
C      it is very straightforward to unroll, using an exit branch midway.
C      Unrolling would allow deeper scheduling, which could improve speed for
C      the L2 cache case.
C   3. For mpn_mul_1: Use more alternating temp areas.  The std and ldx
C      instructions aren't scheduled sufficiently far apart with just two
C      temp areas.
C   4. Specialize for particular v values.  If v's upper 16 bits are zero, we
C      could save many operations (see the sketch below).
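
C For example, when the upper 16 bits of v are zero, p16 is identically zero
C and a single product per limb suffices.  In terms of the hypothetical C
C model above, the loop body would reduce to (again only a sketch):
C
C   if (v >> 16 == 0)
C     for (long i = 0; i < n; i++)
C       {
C         uint64_t p = (uint64_t) up[i] * v + cy;  /* one 48-bit product */
C         rp[i] = (uint32_t) p;
C         cy = p >> 32;
C       }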

C INPUT PARAMETERS
C rp	o0
C up	o1
C n	o2
C v	o3
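
C The corresponding C-level interface (GMP's documented mpn prototype; limbs
C are 32 bits in this configuration):
C
C   mp_limb_t mpn_mul_1 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_limb_t v);
C
C The return value is the carry limb out of the top of the product.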

define(`FSIZE',224)

ASM_START()
PROLOGUE(mpn_mul_1)
	add	%sp, -FSIZE, %sp	C allocate stack scratch space
	sethi	%hi(0xffff), %g1
	srl	%o3, 16, %g2		C g2 = v >> 16
	or	%g1, %lo(0xffff), %g1
	and	%o3, %g1, %g1		C g1 = v & 0xffff
	stx	%g1, [%sp+104]
	stx	%g2, [%sp+112]
	ldd	[%sp+104], %f6
	ldd	[%sp+112], %f8
	fxtod	%f6, %f6		C f6 = (double) (v & 0xffff)
	fxtod	%f8, %f8		C f8 = (double) (v >> 16)
	ld	[%sp+104], %f10	C zero f10 (high half of the f10:f11 pair)

	mov	0, %g3		C cy = 0

	define(`fanop', `fitod %f18, %f0')	C A quasi nop running in the FA pipe

	add	%sp, 160, %o5		C point in scratch area
	and	%o5, -32, %o5		C align at 0 (mod 32) in scratch area

	subcc	%o2, 1, %o2
	ld	[%o1], %f11	C read up[i]
	add	%o1, 4, %o1	C up++
	bne,pt	%icc, .L_two_or_more
	fxtod	%f10, %f2	C f2 = (double) up[i], via the f10:f11 int pair

	fmuld	%f2, %f8, %f16
	fmuld	%f2, %f6, %f4
	fdtox	%f16, %f14
	fdtox	%f4, %f12
	std	%f14, [%o5+16]
	std	%f12, [%o5+24]
	ldx	[%o5+16], %g2	C p16
	ldx	[%o5+24], %g1	C p0
	b	.L1
	add	%o0, -16, %o0

	.align	16
.L_two_or_more:
	subcc	%o2, 1, %o2
	ld	[%o1], %f11	C read up[i]
	fmuld	%f2, %f8, %f16
	fmuld	%f2, %f6, %f4
	add	%o1, 4, %o1	C up++
	bne,pt	%icc, .L_three_or_more
	fxtod	%f10, %f2

	fdtox	%f16, %f14
	fdtox	%f4, %f12
	std	%f14, [%o5+16]
	fmuld	%f2, %f8, %f16
	std	%f12, [%o5+24]
	fmuld	%f2, %f6, %f4
	fdtox	%f16, %f14
	fdtox	%f4, %f12
	std	%f14, [%o5+0]
	std	%f12, [%o5+8]
	ldx	[%o5+16], %g2	C p16
	ldx	[%o5+24], %g1	C p0
	b	.L2
	add	%o0, -12, %o0

	.align	16
.L_three_or_more:
	subcc	%o2, 1, %o2
	ld	[%o1], %f11	C read up[i]
	fdtox	%f16, %f14
	fdtox	%f4, %f12
	std	%f14, [%o5+16]
	fmuld	%f2, %f8, %f16
	std	%f12, [%o5+24]
	fmuld	%f2, %f6, %f4
	add	%o1, 4, %o1	C up++
	bne,pt	%icc, .L_four_or_more
	fxtod	%f10, %f2

	fdtox	%f16, %f14
	fdtox	%f4, %f12
	std	%f14, [%o5+0]
	fmuld	%f2, %f8, %f16
	std	%f12, [%o5+8]
	fmuld	%f2, %f6, %f4
	fdtox	%f16, %f14
	ldx	[%o5+16], %g2	C p16
	fdtox	%f4, %f12
	ldx	[%o5+24], %g1	C p0
	std	%f14, [%o5+16]
	std	%f12, [%o5+24]
	b	.L3
	add	%o0, -8, %o0

	.align	16
.L_four_or_more:
	subcc	%o2, 1, %o2
	ld	[%o1], %f11	C read up[i]
	fdtox	%f16, %f14
	fdtox	%f4, %f12
	std	%f14, [%o5+0]
	fmuld	%f2, %f8, %f16
	std	%f12, [%o5+8]
	fmuld	%f2, %f6, %f4
	add	%o1, 4, %o1	C up++
	bne,pt	%icc, .L_five_or_more
	fxtod	%f10, %f2

	fdtox	%f16, %f14
	ldx	[%o5+16], %g2	C p16
	fdtox	%f4, %f12
	ldx	[%o5+24], %g1	C p0
	std	%f14, [%o5+16]
	fmuld	%f2, %f8, %f16
	std	%f12, [%o5+24]
	fmuld	%f2, %f6, %f4
	add	%o1, 4, %o1	C up++
	b	.L4
	add	%o0, -4, %o0

	.align	16
.L_five_or_more:
	subcc	%o2, 1, %o2
	ld	[%o1], %f11	C read up[i]
	fdtox	%f16, %f14
	ldx	[%o5+16], %g2	C p16
	fdtox	%f4, %f12
	ldx	[%o5+24], %g1	C p0
	std	%f14, [%o5+16]
	fmuld	%f2, %f8, %f16
	std	%f12, [%o5+24]
	fmuld	%f2, %f6, %f4
	add	%o1, 4, %o1	C up++
	bne,pt	%icc, .Loop
	fxtod	%f10, %f2
	b,a	.L5

C BEGIN MAIN LOOP
	.align	16
C --  0
.Loop:	nop
	subcc	%o2, 1, %o2
	ld	[%o1], %f11	C read up[i]
	fdtox	%f16, %f14
C --  1
	sllx	%g2, 16, %g4	C (p16 << 16)
	add	%o0, 4, %o0	C rp++
	ldx	[%o5+0], %g2	C p16
	fdtox	%f4, %f12
C --  2
	nop
	add	%g1, %g4, %g4	C p = p0 + (p16 << 16)
	ldx	[%o5+8], %g1	C p0
	fanop
C --  3
	nop
	add	%g3, %g4, %g4	C p += cy
	std	%f14, [%o5+0]
	fmuld	%f2, %f8, %f16
C --  4
	srlx	%g4, 32, %g3	C new cy
	add	%o1, 4, %o1	C up++
	std	%f12, [%o5+8]
	fmuld	%f2, %f6, %f4
C --  5
	xor	%o5, 16, %o5	C alternate scratch variables
	stw	%g4, [%o0-4]
	bne,pt	%icc, .Loop
	fxtod	%f10, %f2
C END MAIN LOOP

.L5:	fdtox	%f16, %f14
	sllx	%g2, 16, %g4	C (p16 << 16)
	ldx	[%o5+0], %g2	C p16
	fdtox	%f4, %f12
	add	%g1, %g4, %g4	C p = p0 + (p16 << 16)
	ldx	[%o5+8], %g1	C p0
	add	%g4, %g3, %g4	C p += cy
	std	%f14, [%o5+0]
	fmuld	%f2, %f8, %f16
	std	%f12, [%o5+8]
	fmuld	%f2, %f6, %f4
	xor	%o5, 16, %o5
	stw	%g4, [%o0+0]
	srlx	%g4, 32, %g3	C new cy

.L4:	fdtox	%f16, %f14
	sllx	%g2, 16, %g4	C (p16 << 16)
	ldx	[%o5+0], %g2	C p16
	fdtox	%f4, %f12
	add	%g1, %g4, %g4	C p = p0 + (p16 << 16)
	ldx	[%o5+8], %g1	C p0
	add	%g3, %g4, %g4	C p += cy
	std	%f14, [%o5+0]
	std	%f12, [%o5+8]
	xor	%o5, 16, %o5
	stw	%g4, [%o0+4]
	srlx	%g4, 32, %g3	C new cy

.L3:	sllx	%g2, 16, %g4	C (p16 << 16)
	ldx	[%o5+0], %g2	C p16
	add	%g1, %g4, %g4	C p = p0 + (p16 << 16)
	ldx	[%o5+8], %g1	C p0
	add	%g3, %g4, %g4	C p += cy
	xor	%o5, 16, %o5
	stw	%g4, [%o0+8]
	srlx	%g4, 32, %g3	C new cy

.L2:	sllx	%g2, 16, %g4	C (p16 << 16)
	ldx	[%o5+0], %g2	C p16
	add	%g1, %g4, %g4	C p = p0 + (p16 << 16)
	ldx	[%o5+8], %g1	C p0
	add	%g3, %g4, %g4	C p += cy
	stw	%g4, [%o0+12]
	srlx	%g4, 32, %g3	C new cy

.L1:	sllx	%g2, 16, %g4	C (p16 << 16)
	add	%g1, %g4, %g4	C p = p0 + (p16 << 16)
	add	%g3, %g4, %g4	C p += cy
	stw	%g4, [%o0+16]
	srlx	%g4, 32, %g3	C new cy

	mov	%g3, %o0	C return the carry limb
	retl
	sub	%sp, -FSIZE, %sp	C restore stack pointer (delay slot)
EPILOGUE(mpn_mul_1)