dnl  SPARC v9 32-bit mpn_sqr_diagonal.
dnl  Copyright 2001, 2003 Free Software Foundation, Inc.
dnl  This file is part of the GNU MP Library.
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.
32 include(`..
/config.m4
')
C This code uses a very deep software pipeline, due to the need for moving data
C back and forth between the integer registers and floating-point registers.
C
C A VIS variant of this code would make the pipeline less deep, since the
C masking now done in the integer unit could take place in the floating-point
C unit using the FAND instruction.  It would also be possible to save several
C cycles that way.
C
C On UltraSPARC 1 and 2, this code runs at 11 cycles/limb from the Dcache and
C not much slower from the Ecache.  It would perhaps be possible to shave off
C one cycle, but not easily.  We cannot do better than 10 cycles/limb with the
C instructions used, since we have 10 memory operations per limb.  But a VIS
C variant could run three cycles faster than the corresponding non-VIS code.
C This is non-pipelined code showing the algorithm:
C
C       lduw    [up+0],%g4              C 00000000hhhhllll
C       sllx    %g4,16,%g3              C 0000hhhhllll0000
C       or      %g3,%g4,%g2             C 0000hhhhXXXXllll
C       andn    %g2,%g5,%g2             C 0000hhhh0000llll
C       fitod   %f0,%f4                 C hi16
C       fitod   %f1,%f6                 C lo16
C fanop: m4 macro whose expansion is the single instruction fitod %f12,%f10.
C Its own trailing note describes it as a quasi nop for the FA pipe.
C NOTE(review): no use of fanop appears in the lines visible here; confirm
C against the full file that %f10 and %f12 hold no live data where it is used.
84 define(`fanop',`fitod
%f12
,%f10
') dnl A quasi nop running in the FA pipe
C mpn_sqr_diagonal: routine body, bracketed by PROLOGUE and EPILOGUE.
C
C NOTE(review): this listing looks like an incomplete extract.  The numbers
C fused to the front of the lines skip values and several instructions are
C split over more than one line, so the remarks below cover only what the
C visible lines show.
C
C Register roles, as given by the existing inline comments:
C   %i0  res_ptr, advanced by 8 bytes per step
C   %i1  s1_ptr, advanced by 4 bytes per step
C   %g4  the current source limb, shown as 00000000hhhhllll
C   %g5  the mask 0xffff0000, operand of the andn instructions
C   %g2, %g3  scratch for splitting the limb into 16-bit halves
C   %g1, %g2  the partial products p0 and p16 when they are recombined
C
C Repeated limb-splitting pattern (sllx, or, andn):
C   %g3 = %g4 << 16, %g2 = %g3 | %g4, %g2 = %g2 & ~%g5
C which leaves the high half in bits 32 to 47 and the low half in bits 0 to 15
C (0000hhhh0000llll).
C
C Repeated recombination pattern (sllx, add):
C   %g1 = %g1 + (%g2 << 16), that is p0 plus p16 aligned by 16 bits (ADD1).
C
C The two load forms below give two ways of fetching .Lnoll into %f8.
C NOTE(review): the construct that selects between them is not visible here.
93 PROLOGUE(mpn_sqr_diagonal)
98 ld
[%o7
+.Lnoll
-.Lpc
],%f8
',
99 ` sethi %hi(.Lnoll),%g1
100 ld [%g1+%lo(.Lnoll)],%f8')
C sethi loads the upper bits of 0xffff0000; its low 10 bits are zero, so
C %g5 ends up holding exactly 0xffff0000.
102 sethi
%hi
(0xffff0000),%g5
106 add %i1
,4,%i1 C s1_ptr
++
107 sllx
%g4
,16,%g3 C
0000hhhhllll0000
108 or %g3
,%g4
,%g2 C
0000hhhhXXXXllll
111 andn
%g2
,%g5
,%g2 C
0000hhhh0000llll
113 add %i1
,4,%i1 C s1_ptr
++
136 add %i1
,4,%i1 C s1_ptr
++
137 sllx
%g4
,16,%g3 C
0000hhhhllll0000
138 or %g3
,%g4
,%g2 C
0000hhhhXXXXllll
141 andn
%g2
,%g5
,%g2 C
0000hhhh0000llll
145 add %i1
,4,%i1 C s1_ptr
++
175 add %i1
,4,%i1 C s1_ptr
++
177 sllx
%g4
,16,%g3 C
0000hhhhllll0000
178 or %g3
,%g4
,%g2 C
0000hhhhXXXXllll
182 andn
%g2
,%g5
,%g2 C
0000hhhh0000llll
190 add %i1
,4,%i1 C s1_ptr
++
218 add %i1
,4,%i1 C s1_ptr
++
221 sllx
%g4
,16,%g3 C
0000hhhhllll0000
223 or %g3
,%g4
,%g2 C
0000hhhhXXXXllll
229 andn
%g2
,%g5
,%g2 C
0000hhhh0000llll
239 add %i1
,4,%i1 C s1_ptr
++
254 add %i1
,4,%i1 C s1_ptr
++
257 sllx
%g4
,16,%g3 C
0000hhhhllll0000
259 or %g3
,%g4
,%g2 C
0000hhhhXXXXllll
265 andn
%g2
,%g5
,%g2 C
0000hhhh0000llll
C From here on the partial products p16 and p0 are read back from frame slots
C as 64-bit integers (%fp-24 and %fp-16 here, %fp-40 and %fp-32 further down)
C and combined as p0 + (p16 << 16).
283 ldx
[%fp
-24],%g2 C p16
288 ldx
[%fp
-16],%g1 C p0
291 sllx
%g2
,16,%g2 C
align p16
292 add %i0
,8,%i0 C res_ptr
++
296 add %g2
,%g1
,%g1 C
add p16 to p0
(ADD1
)
297 add %i1
,4,%i1 C s1_ptr
++
306 sllx
%g4
,16,%g3 C
0000hhhhllll0000
311 or %g3
,%g4
,%g2 C
0000hhhhXXXXllll
317 andn
%g2
,%g5
,%g2 C
0000hhhh0000llll
333 ldx
[%fp
-40],%g2 C p16
338 ldx
[%fp
-32],%g1 C p0
341 sllx
%g2
,16,%g2 C
align p16
342 add %i0
,8,%i0 C res_ptr
++
346 add %g2
,%g1
,%g1 C
add p16 to p0
(ADD1
)
347 add %i1
,4,%i1 C s1_ptr
++
356 sllx
%g4
,16,%g3 C
0000hhhhllll0000
361 or %g3
,%g4
,%g2 C
0000hhhhXXXXllll
367 andn
%g2
,%g5
,%g2 C
0000hhhh0000llll
C .L5 and .Lend each set %l3 to a frame address (%fp+80 and %fp+72); .Ltail
C then stores %g2 through %l3.
C NOTE(review): the branches that reach these labels are not visible here.
372 .
L5: add %fp
, 80, %l3
378 .
Lend: add %fp
, 72, %l3
382 .
Ltail: stx
%g2
,[%l3
]
388 sllx
%g2
,16,%g2 C
align p16
389 add %i0
,8,%i0 C res_ptr
++
392 add %g2
,%g1
,%g1 C
add p16 to p0
(ADD1
)
393 add %i1
,4,%i1 C s1_ptr
++
409 sllx
%g2
,16,%g2 C
align p16
410 add %i0
,8,%i0 C res_ptr
++
413 add %g2
,%g1
,%g1 C
add p16 to p0
(ADD1
)
429 sllx
%g2
,16,%g2 C
align p16
430 add %i0
,8,%i0 C res_ptr
++
432 add %g2
,%g1
,%g1 C
add p16 to p0
(ADD1
)
443 sllx
%g2
,16,%g2 C
align p16
444 add %i0
,8,%i0 C res_ptr
++
445 add %g2
,%g1
,%g1 C
add p16 to p0
(ADD1
)
C .L1 fetches p16 through the pointer in %l4 rather than a fixed %fp offset.
C NOTE(review): where %l4 is set is not visible here.
450 .
L1: ldx
[%l4
],%g2 C p16
452 sllx
%g2
,16,%g2 C
align p16
453 add %i0
,8,%i0 C res_ptr
++
454 add %g2
,%g1
,%g1 C
add p16 to p0
(ADD1
)
462 EPILOGUE
(mpn_sqr_diagonal
)