beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / sparc32 / v9 / sqr_diagonal.asm
blobe0242798496a15a36977e7980123aec54deb5471
1 dnl SPARC v9 32-bit mpn_sqr_diagonal.
3 dnl Copyright 2001, 2003 Free Software Foundation, Inc.
5 dnl This file is part of the GNU MP Library.
6 dnl
7 dnl The GNU MP Library is free software; you can redistribute it and/or modify
8 dnl it under the terms of either:
9 dnl
10 dnl * the GNU Lesser General Public License as published by the Free
11 dnl Software Foundation; either version 3 of the License, or (at your
12 dnl option) any later version.
13 dnl
14 dnl or
15 dnl
16 dnl * the GNU General Public License as published by the Free Software
17 dnl Foundation; either version 2 of the License, or (at your option) any
18 dnl later version.
19 dnl
20 dnl or both in parallel, as here.
21 dnl
22 dnl The GNU MP Library is distributed in the hope that it will be useful, but
23 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
25 dnl for more details.
26 dnl
27 dnl You should have received copies of the GNU General Public License and the
28 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
29 dnl see https://www.gnu.org/licenses/.
32 include(`../config.m4')
34 C INPUT PARAMETERS
35 C rp i0
36 C up i1
37 C n i2
39 C This code uses a very deep software pipeline, due to the need for moving data
40 C back and forth between the integer registers and floating-point registers.
42 C A VIS variant of this code would make the pipeline less deep, since the
43 C masking now done in the integer unit could take place in the floating-point
44 C unit using the FAND instruction. It would be possible to save several cycles
45 C too.
47 C On UltraSPARC 1 and 2, this code runs at 11 cycles/limb from the Dcache and
48 C not much slower from the Ecache. It would perhaps be possible to shave off
49 C one cycle, but not easily. We cannot do better than 10 cycles/limb with the
50 C used instructions, since we have 10 memory operations per limb. But a VIS
51 C variant could run three cycles faster than the corresponding non-VIS code.
53 C This is non-pipelined code showing the algorithm:
55 C .Loop:
56 C lduw [up+0],%g4 C 00000000hhhhllll
57 C sllx %g4,16,%g3 C 0000hhhhllll0000
58 C or %g3,%g4,%g2 C 0000hhhhXXXXllll
59 C andn %g2,%g5,%g2 C 0000hhhh0000llll
60 C stx %g2,[%fp+80]
61 C ldd [%fp+80],%f0
62 C fitod %f0,%f4 C hi16
63 C fitod %f1,%f6 C lo16
64 C ld [up+0],%f9
65 C fxtod %f8,%f2
66 C fmuld %f2,%f4,%f4
67 C fmuld %f2,%f6,%f6
68 C fdtox %f4,%f4
69 C fdtox %f6,%f6
70 C std %f4,[%fp-24]
71 C std %f6,[%fp-16]
72 C ldx [%fp-24],%g2
73 C ldx [%fp-16],%g1
74 C sllx %g2,16,%g2
75 C add %g2,%g1,%g1
76 C stw %g1,[rp+0]
77 C srlx %g1,32,%l0
78 C stw %l0,[rp+4]
79 C add up,4,up
80 C subcc n,1,n
81 C bne,pt %icc,.Loop
82 C add rp,8,rp
C fanop is a filler instruction for the floating-point add pipe.  It expands
C to a conversion of %f12 into %f10; no other instruction visible here reads
C or writes either register, so the result is simply discarded.
84 define(`fanop',`fitod %f12,%f10') dnl A quasi nop running in the FA pipe
86 ASM_START()
C .Lnoll is a 4-byte aligned zero word placed in the text section.  The
C prologue of mpn_sqr_diagonal loads it into %f8, so that the register pair
C %f8:%f9 holds a zero-extended 32-bit limb once %f9 has been loaded.
88 TEXT
89 ALIGN(4)
90 .Lnoll:
91 .word 0
C ---------------------------------------------------------------------------
C mpn_sqr_diagonal -- square every limb of the source vector on its own.
C
C Inputs: rp in %i0, up in %i1, n in %i2.
C
C For each 32-bit limb u read from up, the 64-bit value u*u is written to rp
C as two 32-bit words, low word first, and rp advances by 8 bytes.
C
C Method per limb:
C   - The integer unit builds the 64-bit pattern 0000hhhh0000llll from u and
C     stores it on the stack.  ldd reloads it as two 32-bit integers, hi16 in
C     %f0 and lo16 in %f1, and fitod converts both to double.
C   - u itself is loaded into %f9, the low half of the pair %f8:%f9 whose
C     high half %f8 holds zero, so fxtod of %f8 gives u as an unsigned value
C     in %f2.
C   - fmuld forms u*hi16 and u*lo16.  Each product is below 2^48 and is
C     therefore exact in double precision.  fdtox turns them back into
C     64-bit integers p16 and p0, which are stored on the stack.
C   - The integer unit reloads both and computes (p16 << 16) + p0 = u*u.
C
C Stack slots:
C   [%fp+72], [%fp+80]    split limb, the two slots alternate
C   [%fp-24], [%fp-16]    p16 and p0 of one limb
C   [%fp-40], [%fp-32]    p16 and p0 of the following limb
C NOTE(review): the two positive offsets lie above %fp, outside the 256 bytes
C reserved by the save below; presumably this is the argument save area of
C the caller frame in the 32-bit SPARC ABI -- confirm.
C
C The work is software pipelined.  The entry code peels off n = 1, 2, 3 and 4
C and jumps into the drain code at .L1, .L2, .L3 or .L4.  n = 5 goes to .L5;
C larger counts run the twice-unrolled loop, which leaves through .Lend from
C its first half or falls out into .L5 after its second half.
C
C n is not tested for zero: the first subcc would give a nonzero result and
C the code would carry on as if limbs remained, so callers must pass n >= 1.
C
C On return %o0 in the caller window is zero, written by the final restore.
C ---------------------------------------------------------------------------
93 PROLOGUE(mpn_sqr_diagonal)
94 save %sp,-256,%sp
C Load the zero word .Lnoll into %f8.  The PIC variant addresses it relative
C to %pc; the other variant uses an absolute sethi/%lo pair.
96 ifdef(`PIC',
97 `.Lpc: rd %pc,%o7
98 ld [%o7+.Lnoll-.Lpc],%f8',
99 ` sethi %hi(.Lnoll),%g1
100 ld [%g1+%lo(.Lnoll)],%f8')
C %g5 = 0xffff0000, the mask that andn uses to clear bits 16-31 of the
C combined value and leave 0000hhhh0000llll.
102 sethi %hi(0xffff0000),%g5
C Bias up by -8.  Integer limb loads use [%i1+8] and FP limb loads use
C [%i1], so the integer side runs two limbs ahead of the FP side.
103 add %i1,-8,%i1
C Split limb 0 and count it.
105 lduw [%i1+8],%g4
106 add %i1,4,%i1 C s1_ptr++
107 sllx %g4,16,%g3 C 0000hhhhllll0000
108 or %g3,%g4,%g2 C 0000hhhhXXXXllll
109 subcc %i2,1,%i2
110 bne,pt %icc,.L_grt_1
111 andn %g2,%g5,%g2 C 0000hhhh0000llll
C n == 1: compute both products of the only limb straight through, leave
C them in [%fp-24] and [%fp-16], and let .L1 combine and store them.
113 add %i1,4,%i1 C s1_ptr++
114 stx %g2,[%fp+80]
115 ld [%i1],%f9
116 ldd [%fp+80],%f0
117 fxtod %f8,%f2
118 fitod %f0,%f4
119 fitod %f1,%f6
120 fmuld %f2,%f4,%f4
121 fmuld %f2,%f6,%f6
122 fdtox %f4,%f4
123 fdtox %f6,%f6
124 std %f4,[%fp-24]
125 std %f6,[%fp-16]
C Buffer pointers for the drain code; .L1 only reads through %l4.
127 add %fp, 80, %l3
128 add %fp, -24, %l4
129 add %fp, 72, %l5
130 b .L1
131 add %fp, -40, %l6
C n >= 2: park split limb 0 in [%fp+80] and split limb 1.
133 .L_grt_1:
134 stx %g2,[%fp+80]
135 lduw [%i1+8],%g4
136 add %i1,4,%i1 C s1_ptr++
137 sllx %g4,16,%g3 C 0000hhhhllll0000
138 or %g3,%g4,%g2 C 0000hhhhXXXXllll
139 subcc %i2,1,%i2
140 bne,pt %icc,.L_grt_2
141 andn %g2,%g5,%g2 C 0000hhhh0000llll
C n == 2: products of limb 0 go to [%fp-24] and [%fp-16]; the products of
C limb 1 are left in %f4 (already converted) and %f6 for .L2 to finish.
143 stx %g2,[%fp+72]
144 ld [%i1],%f9
145 add %i1,4,%i1 C s1_ptr++
146 ldd [%fp+80],%f0
147 fxtod %f8,%f2
148 fitod %f0,%f4
149 fitod %f1,%f6
150 fmuld %f2,%f4,%f4
151 ld [%i1],%f9
152 fmuld %f2,%f6,%f6
153 ldd [%fp+72],%f0
154 fdtox %f4,%f4
155 fdtox %f6,%f6
156 std %f4,[%fp-24]
157 fxtod %f8,%f2
158 std %f6,[%fp-16]
159 fitod %f0,%f4
160 fitod %f1,%f6
161 fmuld %f2,%f4,%f4
162 fmuld %f2,%f6,%f6
163 fdtox %f4,%f4
C .L2 stores the pending products through %l4 and retires limb 0 via %l6.
165 add %fp, 72, %l3
166 add %fp, -40, %l4
167 add %fp, 80, %l5
168 b .L2
169 add %fp, -24, %l6
C n >= 3: park split limb 1 in [%fp+72], split limb 2, and start the FP
C side on limb 0.
171 .L_grt_2:
172 stx %g2,[%fp+72]
173 lduw [%i1+8],%g4
174 ld [%i1],%f9
175 add %i1,4,%i1 C s1_ptr++
176 ldd [%fp+80],%f0
177 sllx %g4,16,%g3 C 0000hhhhllll0000
178 or %g3,%g4,%g2 C 0000hhhhXXXXllll
179 subcc %i2,1,%i2
180 fxtod %f8,%f2
181 bne,pt %icc,.L_grt_3
182 andn %g2,%g5,%g2 C 0000hhhh0000llll
C n == 3: limb 0 products are stored, limb 1 products are in flight and
C split limb 2 has been reloaded into %f0 when control reaches .L3.
184 stx %g2,[%fp+80]
185 fitod %f0,%f4
186 fitod %f1,%f6
187 fmuld %f2,%f4,%f4
188 ld [%i1],%f9
189 fmuld %f2,%f6,%f6
190 add %i1,4,%i1 C s1_ptr++
191 ldd [%fp+72],%f0
192 fdtox %f4,%f4
193 fdtox %f6,%f6
194 std %f4,[%fp-24]
195 fxtod %f8,%f2
196 std %f6,[%fp-16]
197 fitod %f0,%f4
198 fitod %f1,%f6
199 fmuld %f2,%f4,%f4
200 ld [%i1],%f9
201 add %fp, 80, %l3
202 fmuld %f2,%f6,%f6
203 add %fp, -24, %l4
204 ldd [%fp+80],%f0
205 add %fp, 72, %l5
206 fdtox %f4,%f4
207 b .L3
208 add %fp, -40, %l6
C n >= 4: park split limb 2 in [%fp+80], split limb 3, and move the
C products of limb 0 to [%fp-24] and [%fp-16].
210 .L_grt_3:
211 stx %g2,[%fp+80]
212 fitod %f0,%f4
213 lduw [%i1+8],%g4
214 fitod %f1,%f6
215 fmuld %f2,%f4,%f4
216 ld [%i1],%f9
217 fmuld %f2,%f6,%f6
218 add %i1,4,%i1 C s1_ptr++
219 ldd [%fp+72],%f0
220 fdtox %f4,%f4
221 sllx %g4,16,%g3 C 0000hhhhllll0000
222 fdtox %f6,%f6
223 or %g3,%g4,%g2 C 0000hhhhXXXXllll
224 subcc %i2,1,%i2
225 std %f4,[%fp-24]
226 fxtod %f8,%f2
227 std %f6,[%fp-16]
228 bne,pt %icc,.L_grt_4
229 andn %g2,%g5,%g2 C 0000hhhh0000llll
C n == 4: start the products of limb 1 and enter the drain code at .L4.
231 stx %g2,[%fp+72]
232 fitod %f0,%f4
233 fitod %f1,%f6
234 add %fp, 72, %l3
235 fmuld %f2,%f4,%f4
236 add %fp, -40, %l4
237 ld [%i1],%f9
238 fmuld %f2,%f6,%f6
239 add %i1,4,%i1 C s1_ptr++
240 ldd [%fp+80],%f0
241 add %fp, 80, %l5
242 fdtox %f4,%f4
243 b .L4
244 add %fp, -24, %l6
C n >= 5: park split limb 3 in [%fp+72], split limb 4, and move the
C products of limb 1 to [%fp-40] and [%fp-32].  The pipeline is now full.
246 .L_grt_4:
247 stx %g2,[%fp+72]
248 fitod %f0,%f4
249 lduw [%i1+8],%g4
250 fitod %f1,%f6
251 fmuld %f2,%f4,%f4
252 ld [%i1],%f9
253 fmuld %f2,%f6,%f6
254 add %i1,4,%i1 C s1_ptr++
255 ldd [%fp+80],%f0
256 fdtox %f4,%f4
257 sllx %g4,16,%g3 C 0000hhhhllll0000
258 fdtox %f6,%f6
259 or %g3,%g4,%g2 C 0000hhhhXXXXllll
260 subcc %i2,1,%i2
261 std %f4,[%fp-40]
262 fxtod %f8,%f2
263 std %f6,[%fp-32]
264 be,pn %icc,.L5
265 andn %g2,%g5,%g2 C 0000hhhh0000llll
C More than five limbs: enter the loop.  The branch is annulled, so no delay
C slot instruction is executed.
267 b,a .Loop
269 .align 16
C Each pass through the loop handles two limbs.  The first half writes the
C new split limb to [%fp+80], retires the products held in [%fp-24] and
C [%fp-16] to rp, reloads the split limb from [%fp+72] and refills the same
C product slots.  It leaves through .Lend when the count reaches zero.
270 C --- LOOP BEGIN
271 .Loop: nop
273 stx %g2,[%fp+80]
274 fitod %f0,%f4
275 C ---
278 lduw [%i1+8],%g4
279 fitod %f1,%f6
280 C ---
283 ldx [%fp-24],%g2 C p16
284 fanop
285 C ---
288 ldx [%fp-16],%g1 C p0
289 fmuld %f2,%f4,%f4
290 C ---
291 sllx %g2,16,%g2 C align p16
292 add %i0,8,%i0 C res_ptr++
293 ld [%i1],%f9
294 fmuld %f2,%f6,%f6
295 C ---
296 add %g2,%g1,%g1 C add p16 to p0 (ADD1)
297 add %i1,4,%i1 C s1_ptr++
298 ldd [%fp+72],%f0
299 fanop
300 C ---
301 srlx %g1,32,%l0
303 stw %g1,[%i0-8]
304 fdtox %f4,%f4
305 C ---
306 sllx %g4,16,%g3 C 0000hhhhllll0000
308 stw %l0,[%i0-4]
309 fdtox %f6,%f6
310 C ---
311 or %g3,%g4,%g2 C 0000hhhhXXXXllll
312 subcc %i2,1,%i2
313 std %f4,[%fp-24]
314 fxtod %f8,%f2
315 C ---
316 std %f6,[%fp-16]
317 andn %g2,%g5,%g2 C 0000hhhh0000llll
318 be,pn %icc,.Lend
319 fanop
C The second half repeats the same steps with the slots exchanged: split
C limb to [%fp+72], products from and to [%fp-40] and [%fp-32], reload from
C [%fp+80].  It branches back while the count is nonzero.
320 C --- LOOP MIDDLE
323 stx %g2,[%fp+72]
324 fitod %f0,%f4
325 C ---
328 lduw [%i1+8],%g4
329 fitod %f1,%f6
330 C ---
333 ldx [%fp-40],%g2 C p16
334 fanop
335 C ---
338 ldx [%fp-32],%g1 C p0
339 fmuld %f2,%f4,%f4
340 C ---
341 sllx %g2,16,%g2 C align p16
342 add %i0,8,%i0 C res_ptr++
343 ld [%i1],%f9
344 fmuld %f2,%f6,%f6
345 C ---
346 add %g2,%g1,%g1 C add p16 to p0 (ADD1)
347 add %i1,4,%i1 C s1_ptr++
348 ldd [%fp+80],%f0
349 fanop
350 C ---
351 srlx %g1,32,%l0
353 stw %g1,[%i0-8]
354 fdtox %f4,%f4
355 C ---
356 sllx %g4,16,%g3 C 0000hhhhllll0000
358 stw %l0,[%i0-4]
359 fdtox %f6,%f6
360 C ---
361 or %g3,%g4,%g2 C 0000hhhhXXXXllll
362 subcc %i2,1,%i2
363 std %f4,[%fp-40]
364 fxtod %f8,%f2
365 C ---
366 std %f6,[%fp-32]
367 andn %g2,%g5,%g2 C 0000hhhh0000llll
368 bne,pt %icc,.Loop
369 fanop
370 C --- LOOP END
C The drain code is shared, so the slot addresses are passed in registers:
C   %l3  slot that receives the last split limb and is reloaded in .L4
C   %l5  slot holding the split limb reloaded in .Ltail
C   %l4  product pair retired first, then refilled
C   %l6  the other product pair
C .L5 is reached after the second loop half or directly when n == 5.
372 .L5: add %fp, 80, %l3
373 add %fp, -24, %l4
374 add %fp, 72, %l5
375 b .Ltail
376 add %fp, -40, %l6
C .Lend is reached from the first loop half, so the roles are exchanged.
378 .Lend: add %fp, 72, %l3
379 add %fp, -40, %l4
380 add %fp, 80, %l5
381 add %fp, -24, %l6
C Drain stage with five limbs outstanding: store the last split limb, retire
C one finished limb to rp and keep the FP side moving.
382 .Ltail: stx %g2,[%l3]
383 fitod %f0,%f4
384 fitod %f1,%f6
385 ldx [%l4],%g2 C p16
386 ldx [%l4+8],%g1 C p0
387 fmuld %f2,%f4,%f4
388 sllx %g2,16,%g2 C align p16
389 add %i0,8,%i0 C res_ptr++
390 ld [%i1],%f9
391 fmuld %f2,%f6,%f6
392 add %g2,%g1,%g1 C add p16 to p0 (ADD1)
393 add %i1,4,%i1 C s1_ptr++
394 ldd [%l5],%f0
395 srlx %g1,32,%l0
396 stw %g1,[%i0-8]
397 fdtox %f4,%f4
398 stw %l0,[%i0-4]
C Four limbs outstanding; also the entry point for n == 4.  The last limb
C is loaded into %f9 here and the last split limb is reloaded from %l3.
399 .L4: fdtox %f6,%f6
400 std %f4,[%l4]
401 fxtod %f8,%f2
402 std %f6,[%l4+8]
404 fitod %f0,%f4
405 fitod %f1,%f6
406 ldx [%l6],%g2 C p16
407 ldx [%l6+8],%g1 C p0
408 fmuld %f2,%f4,%f4
409 sllx %g2,16,%g2 C align p16
410 add %i0,8,%i0 C res_ptr++
411 ld [%i1],%f9
412 fmuld %f2,%f6,%f6
413 add %g2,%g1,%g1 C add p16 to p0 (ADD1)
414 ldd [%l3],%f0
415 srlx %g1,32,%l0
416 stw %g1,[%i0-8]
417 fdtox %f4,%f4
418 stw %l0,[%i0-4]
C Three limbs outstanding; also the entry point for n == 3.  No further
C loads from up are made from here on.
419 .L3: fdtox %f6,%f6
420 std %f4,[%l6]
421 fxtod %f8,%f2
422 std %f6,[%l6+8]
424 fitod %f0,%f4
425 fitod %f1,%f6
426 ldx [%l4],%g2 C p16
427 ldx [%l4+8],%g1 C p0
428 fmuld %f2,%f4,%f4
429 sllx %g2,16,%g2 C align p16
430 add %i0,8,%i0 C res_ptr++
431 fmuld %f2,%f6,%f6
432 add %g2,%g1,%g1 C add p16 to p0 (ADD1)
433 srlx %g1,32,%l0
434 stw %g1,[%i0-8]
435 fdtox %f4,%f4
436 stw %l0,[%i0-4]
C Two limbs outstanding; also the entry point for n == 2.  Store the last
C pair of products through %l4 and retire the pair held at %l6.
437 .L2: fdtox %f6,%f6
438 std %f4,[%l4]
439 std %f6,[%l4+8]
441 ldx [%l6],%g2 C p16
442 ldx [%l6+8],%g1 C p0
443 sllx %g2,16,%g2 C align p16
444 add %i0,8,%i0 C res_ptr++
445 add %g2,%g1,%g1 C add p16 to p0 (ADD1)
446 srlx %g1,32,%l0
447 stw %g1,[%i0-8]
448 stw %l0,[%i0-4]
C One limb outstanding; also the entry point for n == 1.  Retire the pair
C held at %l4.
450 .L1: ldx [%l4],%g2 C p16
451 ldx [%l4+8],%g1 C p0
452 sllx %g2,16,%g2 C align p16
453 add %i0,8,%i0 C res_ptr++
454 add %g2,%g1,%g1 C add p16 to p0 (ADD1)
455 srlx %g1,32,%l0
456 stw %g1,[%i0-8]
457 stw %l0,[%i0-4]
C Return to the caller.  The restore executes in the delay slot of ret; it
C pops the register window and writes %g0+%g0 = 0 to %o0 of the caller.
459 ret
460 restore %g0,%g0,%o0
462 EPILOGUE(mpn_sqr_diagonal)