1 dnl IA-64 mpn_sqr_diag_addlsh1
3 dnl Contributed to the GNU project by Torbjorn Granlund.
5 dnl Copyright 2010, 2011 Free Software Foundation, Inc.
7 dnl This file is part of the GNU MP Library.
9 dnl The GNU MP Library is free software; you can redistribute it and/or modify
10 dnl it under the terms of either:
12 dnl * the GNU Lesser General Public License as published by the Free
13 dnl Software Foundation; either version 3 of the License, or (at your
14 dnl option) any later version.
18 dnl * the GNU General Public License as published by the Free Software
19 dnl Foundation; either version 2 of the License, or (at your option) any
22 dnl or both in parallel, as here.
24 dnl The GNU MP Library is distributed in the hope that it will be useful, but
25 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
26 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
29 dnl You should have received copies of the GNU General Public License and the
30 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
31 dnl see https://www.gnu.org/licenses/.
33 include(`../config.m4')
37 C Itanium 2: 2 Unrolling could bring it to 1.5 + epsilon
39 C Exact performance table. The 2nd line is this code, the 3rd line is ctop-
40 C less code. In an assembly sqr_basecase, the ctop-full numbers will become a
41 C few cycles better since we can mitigate the many I0 instructions.
43 C 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
44 C - 20 22 24 26 28 30 32 34 36 38 40 42 44 46 48 50 52 54 56 Needs updating
45 C - 13 16 17 18 20 21 23 25 26 30 31 31 33 34 36 38 39 42 43
47 C We should keep in mind that this code takes linear time in a O(n^2) context
48 C and that it will only be used under SQR_TOOM2_THRESHOLD, which might become
49 C around 60. Keeping overhead down for smallish operands (< 10) is more
50 C important than optimal cycle counts.
53 C * Make sure we don't depend on uninitialised r-registers, f-registers, or
55 C * Optimise by doing first two loop iterations in function header.
C Register assignments: each X_param macro names an incoming register (r32-r34)
C and the matching X macro names the working register the function body uses.
C The trailing size notes give the operand lengths in terms of n.
58 define(`rp_param', `r32') define(`rp', `r14') C size: 2n
59 define(`tp_param', `r33') define(`tp', `r15') C size: 2n - 2
60 define(`up_param', `r34') define(`up', `r31') C size: n
C mpn_sqr_diag_addlsh1
C
C Operands (sizes taken from the register defines in this file):
C   rp  result area, 2n limbs
C   tp  input, 2n - 2 limbs
C   up  input, n limbs
C
C What the visible code does: limbs are streamed from up and squared with
C xma.l/xma.hu, limbs are streamed pairwise from tp and shifted across limb
C boundaries with shrp ..., 63 (a one-bit left shift), the two streams are
C added with explicit carry handling, and the sums are stored through rp.
C NOTE(review): presumably rp = (diagonal squares of up) + 2*tp -- confirm
C against the caller.
C
C The loop is software pipelined: instructions are guarded by predicates in
C the rotating range (p16 and up) and operate on rotating registers, so one
C register name denotes different values in different iterations.  Statement
C order inside the loop must not be changed.
C
C NOTE(review): the L(top) and L(end) labels, the loop count setup, and the
C closing bundle brace and EPILOGUE are not visible in this part of the file.
72 PROLOGUE(mpn_sqr_diag_addlsh1)
C Register frame: 4 inputs, 24 locals, 0 outputs, 24 rotating registers.
C The previous ar.pfs value is kept in r2.
79 {.mii; alloc r2 = ar.pfs, 4,24,0,24 C M
C Copy the incoming pointers to their working registers.  The ABI32 variant
C uses addp4 instead of a plain mov.
83 }{.mmi; ABI64(` mov tp = tp_param ') C M I
84 ABI32(` addp4 tp = 0, tp_param ') C M I
85 ABI64(` mov up = up_param ') C M I
86 ABI32(` addp4 up = 0, up_param ') C M I
87 ABI64(` mov rp = rp_param ') C M I
88 ABI32(` addp4 rp = 0, rp_param ') C M I
C Preload the first two limbs from tp; each load post-increments tp by 8.
90 }{.mmi; ld8 r36 = [tp], 8 C M
94 }{.mmi; ld8 r32 = [tp], 8 C M
C 0x30000 has bits 16 and 17 set: p16 and p17 start true, the remaining
C rotating predicates start false.
104 mov pr.rot = 0x30000 C I0
106 } br.cexit.spnt.few.clr L(end)
108 dnl *** MAIN LOOP START ***
C Load the next limb from up into the FP register file for squaring.
111 {.mfi; (p18) ldf8 f33 = [up], 8 C M
C Low 64 bits of f35*f35 + f42.
112 (p20) xma.l f36 = f35, f35, f42 C F
C A pending carry (p41) is added into r44 below; p50 records whether r44 was
C all ones beforehand, i.e. whether that increment wraps.
113 (p41) cmpequc p50, p0 = -1, r44 C M I
C Move the shifted tp limb (r16, produced by the critical shrp) to the FP side.
114 }{.mfi; setfsig f40 = r16 C M23
C High 64 bits (unsigned) of f35*f35 + f42.
115 (p20) xma.hu f38 = f35, f35, f42 C F
116 (p23) add r50 = r41, r49 C M I
118 }{.mmi; (p16) ld8 r36 = [tp], 8 C M
C Unsigned sum < addend means the add above carried out.
119 (p23) cmpltu p40, p0 = r50, r41 C cyout hi M I
C shrp by 63 takes the low 63 bits of the first source and the top bit of the
C second: a one-bit left shift carried across the limb boundary.
120 (p19) shrp r45 = r38, r35, 63 C non-critical I0
121 }{.mmi; (p21) getfsig r39 = f39 C hi M2
122 (p24) st8 [rp] = r51, 8 C hi M23
123 (p41) add r44 = 1, r44 C M I
125 }{.mmi; (p16) ld8 r32 = [tp], 8 C M
C Also flag a carry-out when propagating the wrap from the low word (p50)
C would wrap the high word.
126 (p50) cmpeqor p40, p0 = -1, r50 C cyout hi M I
127 (p17) shrp r16 = r33, r37, 63 C critical I0
128 }{.mmi; (p21) getfsig r42 = f37 C lo M2
129 (p23) st8 [rp] = r44, 8 C lo M23
130 (p50) add r50 = 1, r50 C M I
132 } br.ctop.sptk.few.clr L(top) C B
133 dnl *** MAIN LOOP END ***
C Wind-down: apply the last pending carry, add in the bit shifted out of the
C top of r39, and store the final three limbs.
137 (p41) add r44 = 1, r44 C M I
C Top bit of r39.
138 shr.u r48 = r39, 63 C I0
140 }{.mmi; st8 [rp] = r51, 8 C M23
C p6 is set when the increment above wrapped r44 to zero.
141 (p41) cmpequc p6, p0 = 0, r44 C M I
142 add r50 = r41, r48 C M I
144 }{.mmi; st8 [rp] = r44, 8 C M23
145 (p6) add r50 = 1, r50 C M I
C Last limb; no post-increment.
148 }{.mii; st8 [rp] = r50 C M23
154 br.ret.sptk.many b0 C B