dnl  SPARC v9 32-bit mpn_addmul_1 -- Multiply a limb vector with a limb and add
dnl  the result to a second limb vector.

dnl  Copyright 1998, 2000, 2001, 2003 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C Algorithm: We use two floating-point multiplies per limb product, with the
C invariant v operand split into two 16-bit pieces, and the u operand split
C into 32-bit pieces.  We convert the two 48-bit products and transfer them to
C the integer unit.
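C
C As a rough C sketch of one limb product (illustrative only; the variable
C names are ours, and the real code performs these multiplies on doubles in
C the FP unit after fxtod conversions):
C
C     uint64_t p0  = (uint64_t) u * (v & 0xffff);  /* fits in 48 bits */
C     uint64_t p16 = (uint64_t) u * (v >> 16);     /* fits in 48 bits */
C     uint64_t p   = p0 + (p16 << 16);             /* full u * v */
C
C Both 48-bit partial products are exactly representable in a double, which
C is what makes the floating-point route safe.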

C			cycles/limb
C UltraSPARC 1&2:	6.5
C UltraSPARC 3:		?

C Possible optimizations:
C   1. Combine 32-bit memory operations into 64-bit operations.  Since we're
C      memory bandwidth limited, this could save 1.5 cycles/limb.
C   2. Unroll the inner loop.  Since we already use alternate temporary areas,
C      it is very straightforward to unroll, using an exit branch midway.
C      Unrolling would allow deeper scheduling, which could improve speed for
C      the L2 cache case.
C   3. For mpn_mul_1: use more alternating temp areas.  The std and ldx
C      instructions aren't scheduled far enough apart with just two temp areas.
C   4. Specialize for particular v values.  If the upper 16 bits of v are
C      zero, we could save many operations.

C INPUT PARAMETERS
C rp	o0
C up	o1
C n	o2
C v	o3
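C
C In GMP's C interface this routine is (limbs are 32 bits here):
C
C     mp_limb_t mpn_addmul_1 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_limb_t v);
C
C It returns the carry limb out of the most significant end of rp.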

define(`FSIZE',224)

ASM_START()
PROLOGUE(mpn_addmul_1)
	add	%sp, -FSIZE, %sp	C allocate stack scratch area
	sethi	%hi(0xffff), %g1
	srl	%o3, 16, %g2		C g2 = v16 = v >> 16
	or	%g1, %lo(0xffff), %g1
	and	%o3, %g1, %g1		C g1 = v0 = v & 0xffff
	stx	%g1, [%sp+104]		C park v0, v16 in memory...
	stx	%g2, [%sp+112]
	ldd	[%sp+104], %f6		C ...and load them into FP registers
	ldd	[%sp+112], %f8
	fxtod	%f6, %f6		C f6 = (double) v0
	fxtod	%f8, %f8		C f8 = (double) v16
	ld	[%sp+104], %f10		C zero f10

	mov	0, %g3			C cy = 0

define(`fanop', `fitod %f18, %f0')	C A quasi nop running in the FA pipe

	add	%sp, 160, %o5		C point in scratch area
	and	%o5, -32, %o5		C align at 0 (mod 32) in scratch area
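C (in C terms, roughly: scratch = (sp + 160) & ~(uintptr_t) 31)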

	subcc	%o2, 1, %o2
	ld	[%o1], %f11		C read up[i]
	add	%o1, 4, %o1		C up++
	bne,pt	%icc, .L_two_or_more
	fxtod	%f10, %f2		C u as double (delay slot)

	fmuld	%f2, %f8, %f16		C f16 = u * v16
	fmuld	%f2, %f6, %f4		C f4 = u * v0
	fdtox	%f16, %f14
	fdtox	%f4, %f12
	std	%f14, [%o5+16]
	std	%f12, [%o5+24]
	ldx	[%o5+16], %g2		C p16
	ldx	[%o5+24], %g1		C p0
	lduw	[%o0], %g5		C read rp[i]
	b	.L1
	add	%o0, -16, %o0		C bias rp so .L1 stores to rp[0]

	.align	16
.L_two_or_more:
	subcc	%o2, 1, %o2
	ld	[%o1], %f11		C read up[i]
	fmuld	%f2, %f8, %f16
	fmuld	%f2, %f6, %f4
	add	%o1, 4, %o1		C up++
	bne,pt	%icc, .L_three_or_more
	fxtod	%f10, %f2

	fdtox	%f16, %f14
	fdtox	%f4, %f12
	std	%f14, [%o5+16]
	fmuld	%f2, %f8, %f16
	std	%f12, [%o5+24]
	fmuld	%f2, %f6, %f4
	fdtox	%f16, %f14
	fdtox	%f4, %f12
	std	%f14, [%o5+0]
	std	%f12, [%o5+8]
	lduw	[%o0], %g5		C read rp[i]
	ldx	[%o5+16], %g2		C p16
	ldx	[%o5+24], %g1		C p0
	b	.L2
	add	%o0, -12, %o0

	.align	16
.L_three_or_more:
	subcc	%o2, 1, %o2
	ld	[%o1], %f11		C read up[i]
	fdtox	%f16, %f14
	fdtox	%f4, %f12
	std	%f14, [%o5+16]
	fmuld	%f2, %f8, %f16
	std	%f12, [%o5+24]
	fmuld	%f2, %f6, %f4
	add	%o1, 4, %o1		C up++
	bne,pt	%icc, .L_four_or_more
	fxtod	%f10, %f2

	fdtox	%f16, %f14
	fdtox	%f4, %f12
	std	%f14, [%o5+0]
	fmuld	%f2, %f8, %f16
	std	%f12, [%o5+8]
	fmuld	%f2, %f6, %f4
	fdtox	%f16, %f14
	ldx	[%o5+16], %g2		C p16
	fdtox	%f4, %f12
	ldx	[%o5+24], %g1		C p0
	std	%f14, [%o5+16]
	std	%f12, [%o5+24]
	lduw	[%o0], %g5		C read rp[i]
	b	.L3
	add	%o0, -8, %o0

	.align	16
.L_four_or_more:
	subcc	%o2, 1, %o2
	ld	[%o1], %f11		C read up[i]
	fdtox	%f16, %f14
	fdtox	%f4, %f12
	std	%f14, [%o5+0]
	fmuld	%f2, %f8, %f16
	std	%f12, [%o5+8]
	fmuld	%f2, %f6, %f4
	add	%o1, 4, %o1		C up++
	bne,pt	%icc, .L_five_or_more
	fxtod	%f10, %f2

	fdtox	%f16, %f14
	ldx	[%o5+16], %g2		C p16
	fdtox	%f4, %f12
	ldx	[%o5+24], %g1		C p0
	std	%f14, [%o5+16]
	fmuld	%f2, %f8, %f16
	std	%f12, [%o5+24]
	fmuld	%f2, %f6, %f4
	add	%o1, 4, %o1		C up++
	lduw	[%o0], %g5		C read rp[i]
	b	.L4
	add	%o0, -4, %o0

	.align	16
.L_five_or_more:
	subcc	%o2, 1, %o2
	ld	[%o1], %f11		C read up[i]
	fdtox	%f16, %f14
	ldx	[%o5+16], %g2		C p16
	fdtox	%f4, %f12
	ldx	[%o5+24], %g1		C p0
	std	%f14, [%o5+16]
	fmuld	%f2, %f8, %f16
	std	%f12, [%o5+24]
	fmuld	%f2, %f6, %f4
	add	%o1, 4, %o1		C up++
	lduw	[%o0], %g5		C read rp[i]
	bne,pt	%icc, .Loop
	fxtod	%f10, %f2
	b,a	.L5

C BEGIN MAIN LOOP
	.align	16
C --  0
.Loop:	nop
	subcc	%o2, 1, %o2
	ld	[%o1], %f11		C read up[i]
	fdtox	%f16, %f14
C --  1
	sllx	%g2, 16, %g4		C (p16 << 16)
	add	%o0, 4, %o0		C rp++
	ldx	[%o5+0], %g2		C p16
	fdtox	%f4, %f12
C --  2

	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
	ldx	[%o5+8], %g1		C p0
	fanop
C --  3

	add	%g3, %g4, %g4		C p += cy
	std	%f14, [%o5+0]
	fmuld	%f2, %f8, %f16
C --  4

	add	%g5, %g4, %g4		C p += rp[i]
	std	%f12, [%o5+8]
	fmuld	%f2, %f6, %f4
C --  5
	xor	%o5, 16, %o5		C alternate scratch variables
	add	%o1, 4, %o1		C up++
	stw	%g4, [%o0-4]
	fanop
C --  6
	srlx	%g4, 32, %g3		C new cy
	lduw	[%o0], %g5		C read rp[i]
	bne,pt	%icc, .Loop
	fxtod	%f10, %f2
C END MAIN LOOP
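C
C In C terms, each trip through the loop performs (a sketch only; the real
C code overlaps two iterations through the alternating scratch areas):
C
C     p  = p0 + (p16 << 16);	/* assemble the 64-bit product u * v */
C     p += cy;			/* add carry from the previous limb */
C     p += rp[i];		/* add the existing rp limb */
C     rp[i] = (uint32_t) p;	/* store the low 32 bits */
C     cy = p >> 32;		/* high bits become the next carry */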

.L5:	fdtox	%f16, %f14
	sllx	%g2, 16, %g4		C (p16 << 16)
	ldx	[%o5+0], %g2		C p16
	fdtox	%f4, %f12
	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
	ldx	[%o5+8], %g1		C p0
	add	%g4, %g3, %g4		C p += cy
	std	%f14, [%o5+0]
	fmuld	%f2, %f8, %f16
	add	%g5, %g4, %g4		C p += rp[i]
	std	%f12, [%o5+8]
	fmuld	%f2, %f6, %f4
	xor	%o5, 16, %o5
	stw	%g4, [%o0+0]
	srlx	%g4, 32, %g3		C new cy
	lduw	[%o0+4], %g5		C read rp[i]

.L4:	fdtox	%f16, %f14
	sllx	%g2, 16, %g4		C (p16 << 16)
	ldx	[%o5+0], %g2		C p16
	fdtox	%f4, %f12
	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
	ldx	[%o5+8], %g1		C p0
	add	%g3, %g4, %g4		C p += cy
	std	%f14, [%o5+0]
	add	%g5, %g4, %g4		C p += rp[i]
	std	%f12, [%o5+8]
	xor	%o5, 16, %o5
	stw	%g4, [%o0+4]
	srlx	%g4, 32, %g3		C new cy
	lduw	[%o0+8], %g5		C read rp[i]

.L3:	sllx	%g2, 16, %g4		C (p16 << 16)
	ldx	[%o5+0], %g2		C p16
	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
	ldx	[%o5+8], %g1		C p0
	add	%g3, %g4, %g4		C p += cy
	add	%g5, %g4, %g4		C p += rp[i]
	xor	%o5, 16, %o5
	stw	%g4, [%o0+8]
	srlx	%g4, 32, %g3		C new cy
	lduw	[%o0+12], %g5		C read rp[i]

.L2:	sllx	%g2, 16, %g4		C (p16 << 16)
	ldx	[%o5+0], %g2		C p16
	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
	ldx	[%o5+8], %g1		C p0
	add	%g3, %g4, %g4		C p += cy
	add	%g5, %g4, %g4		C p += rp[i]
	stw	%g4, [%o0+12]
	srlx	%g4, 32, %g3		C new cy
	lduw	[%o0+16], %g5		C read rp[i]

.L1:	sllx	%g2, 16, %g4		C (p16 << 16)
	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
	add	%g3, %g4, %g4		C p += cy
	add	%g5, %g4, %g4		C p += rp[i]
	stw	%g4, [%o0+16]
	srlx	%g4, 32, %g3		C new cy

	mov	%g3, %o0		C return the carry limb
	retl
	sub	%sp, -FSIZE, %sp	C deallocate scratch frame (delay slot)
EPILOGUE(mpn_addmul_1)