dnl  Alpha ev6 mpn_add_n -- Add two limb vectors of the same length > 0 and
dnl  store sum in a third limb vector.
dnl
dnl  Copyright 2000, 2003, 2005 Free Software Foundation, Inc.
dnl
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.
dnl
dnl  Shared m4 definitions.  The macros used below that this file does not
dnl  define itself presumably come from here.
include(`../config.m4')
C  cy	r20   (for mpn_add_nc)

C TODO
C   Finish cleaning up cy registers r22, r23 (make them use cy0/cy1)
C   Use multi-pronged feed-in.
C   Perform additional micro-tuning
C  This code was written in cooperation with ev6 pipeline expert Steve Root.

C  Pair loads and stores where possible
C  Store pairs oct-aligned where possible (didn't need it here)
C  Stores are delayed every third cycle
C  Loads and stores are delayed by fills
C  U stays still, put code there where possible (note alternation of U1 and U0)
C  L moves because of loads and stores
C  Note dampers in L to limit damage
C  This odd-looking optimization expects that we are having random bits in our
C  data, so that a pure zero result is unlikely.  So we penalize the unlikely
C  case to help the common case.
C Symbolic names for the operand limb registers.  The code alternates
C between the 0 pair and the 1 pair for successive limbs.
define(`u0', `r0')
define(`u1', `r3')
define(`v0', `r1')
define(`v1', `r4')

C Symbolic names for two of the carry registers.  r22 and r23 are also used
C as carries but are still referred to by number.
define(`cy0', `r20')
define(`cy1', `r21')
MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc)

C mpn_add_n / mpn_add_nc
C
C Register roles, as used by the code below:
C   r16   destination pointer (sums are written through it with stq)
C   r17   first source pointer  (u limbs, loaded into u0/u1)
C   r18   second source pointer (v limbs, loaded into v0/v1)
C   r19   limb count
C   r20   cy0: carry in for mpn_add_nc, cleared on entry to mpn_add_n
C   r0    carry out
C   r2, r5, r7, r8        sums in flight
C   cy0, cy1, r22, r23    carries in flight
C
C Carry scheme.  Each limb is computed as s = u + v, c = (s < v), and then
C the carry from the previous limb is added to s.  That second add can only
C wrap when it produces exactly zero, so its carry is not computed inline.
C A zero result branches to a fix stub at the end of the file, which ORs the
C incoming carry into the outgoing one and branches back to the matching
C ret label.
C
C Shape.  Operands shorter than 5 limbs go straight to the one limb per
C iteration loop.  Otherwise a feed-in plus wind-down sequence handles 5
C limbs, the main loop handles 8 limbs per iteration, and the one limb loop
C mops up whatever is left.
C
C NOTE(review): the copy of this file under review had lost lines (the loads
C through r18, the stq of r2, the br back from each fix stub, the Loop0
C pointer updates and back-branch, both ret instructions, the Lsmall entry,
C the ALIGN directives and the ASM_START/PROLOGUE/EPILOGUE/ASM_END
C wrappers).  They are restored here from the structure of the code.
C Please verify against the upstream GMP file before committing.

ASM_START()
PROLOGUE(mpn_add_nc)
	br	r31,	$entry		C carry in is already in cy0
EPILOGUE()
PROLOGUE(mpn_add_n)
	bis	r31,	r31,	cy0	C clear carry in
$entry:	cmpult	r19,	5,	r22	C L1 r22 = (n < 5)
	ldq	u1,	0(r17)		C L0 get next ones
	ldq	v1,	0(r18)		C L1
	bne	r22,	$Lsmall		C short operand, use the simple loop

	ldq	u0,	8(r17)		C L0 get next ones
	ldq	v0,	8(r18)		C L1
	addq	u1,	v1,	r5	C U0 add two data

	cmpult	r5,	v1,	r23	C U0 did it carry
	ldq	u1,	16(r17)		C L0 get next ones
	ldq	v1,	16(r18)		C L1

	addq	u0,	v0,	r8	C U1 add two data
	addq	r5,	cy0,	r5	C U0 carry in
	cmpult	r8,	v0,	r22	C U1 did it carry
	beq	r5,	$fix5f		C U0 fix exact zero
$ret5f:	ldq	u0,	24(r17)		C L0 get next ones
	ldq	v0,	24(r18)		C L1

	addq	r8,	r23,	r8	C U1 carry from last
	addq	u1,	v1,	r7	C U0 add two data

	beq	r8,	$fix6f		C U1 fix exact zero
$ret6f:	cmpult	r7,	v1,	r23	C U0 did it carry
	ldq	u1,	32(r17)		C L0 get next ones
	ldq	v1,	32(r18)		C L1

	lda	r17,	40(r17)		C L0 move pointer
	lda	r18,	40(r18)		C L1 move pointer

	lda	r16,	-8(r16)		C bias rp so the first store is at 8(r16)
	lda	r19,	-13(r19)	C L1 5 limbs here plus 8 for a full loop
	blt	r19,	$Lend		C U1 loop control


C Main loop.  8-way unrolled.
	ALIGN(16)
$Loop:	addq	u0,	v0,	r2	C U1 add two data
	addq	r7,	r22,	r7	C U0 add in carry
	stq	r5,	8(r16)		C L0 put an answer
	stq	r8,	16(r16)		C L1 pair

	cmpult	r2,	v0,	cy1	C U1 did it carry
	beq	r7,	$fix7		C U0 fix exact 0
$ret7:	ldq	u0,	0(r17)		C L0 get next ones
	ldq	v0,	0(r18)		C L1

	bis	r31,	r31,	r31	C L  damp out
	addq	r2,	r23,	r2	C U1 carry from last
	bis	r31,	r31,	r31	C L  moves in L !
	addq	u1,	v1,	r5	C U0 add two data

	beq	r2,	$fix0		C U1 fix exact zero
$ret0:	cmpult	r5,	v1,	cy0	C U0 did it carry
	ldq	u1,	8(r17)		C L0 get next ones
	ldq	v1,	8(r18)		C L1

	addq	u0,	v0,	r8	C U1 add two data
	addq	r5,	cy1,	r5	C U0 carry from last
	stq	r7,	24(r16)		C L0 store pair
	stq	r2,	32(r16)		C L1

	cmpult	r8,	v0,	r22	C U1 did it carry
	beq	r5,	$fix1		C U0 fix exact zero
$ret1:	ldq	u0,	16(r17)		C L0 get next ones
	ldq	v0,	16(r18)		C L1

	lda	r16,	64(r16)		C L0 move pointer
	addq	r8,	cy0,	r8	C U1 carry from last
	lda	r19,	-8(r19)		C L1 move counter
	addq	u1,	v1,	r7	C U0 add two data

	beq	r8,	$fix2		C U1 fix exact zero
$ret2:	cmpult	r7,	v1,	r23	C U0 did it carry
	ldq	u1,	24(r17)		C L0 get next ones
	ldq	v1,	24(r18)		C L1

	addq	u0,	v0,	r2	C U1 add two data
	addq	r7,	r22,	r7	C U0 add in carry
	stq	r5,	-24(r16)	C L0 put an answer
	stq	r8,	-16(r16)	C L1 pair

	cmpult	r2,	v0,	cy1	C U1 did it carry
	beq	r7,	$fix3		C U0 fix exact 0
$ret3:	ldq	u0,	32(r17)		C L0 get next ones
	ldq	v0,	32(r18)		C L1

	bis	r31,	r31,	r31	C L  damp out
	addq	r2,	r23,	r2	C U1 carry from last
	bis	r31,	r31,	r31	C L  moves in L !
	addq	u1,	v1,	r5	C U0 add two data

	beq	r2,	$fix4		C U1 fix exact zero
$ret4:	cmpult	r5,	v1,	cy0	C U0 did it carry
	ldq	u1,	40(r17)		C L0 get next ones
	ldq	v1,	40(r18)		C L1

	addq	u0,	v0,	r8	C U1 add two data
	addq	r5,	cy1,	r5	C U0 carry from last
	stq	r7,	-8(r16)		C L0 store pair
	stq	r2,	0(r16)		C L1

	cmpult	r8,	v0,	r22	C U1 did it carry
	beq	r5,	$fix5		C U0 fix exact zero
$ret5:	ldq	u0,	48(r17)		C L0 get next ones
	ldq	v0,	48(r18)		C L1

	ldl	r31,	256(r17)	C L0 prefetch
	addq	r8,	cy0,	r8	C U1 carry from last
	ldl	r31,	256(r18)	C L1 prefetch
	addq	u1,	v1,	r7	C U0 add two data

	beq	r8,	$fix6		C U1 fix exact zero
$ret6:	cmpult	r7,	v1,	r23	C U0 did it carry
	ldq	u1,	56(r17)		C L0 get next ones
	ldq	v1,	56(r18)		C L1

	lda	r17,	64(r17)		C L0 move pointer
	bis	r31,	r31,	r31	C U
	lda	r18,	64(r18)		C L1 move pointer
	bge	r19,	$Loop		C U1 loop control

C Wind-down.  Finish the limbs already loaded, without loading any more.
$Lend:	addq	u0,	v0,	r2	C U1 add two data
	addq	r7,	r22,	r7	C U0 add in carry
	stq	r5,	8(r16)		C L0 put an answer
	stq	r8,	16(r16)		C L1 pair
	cmpult	r2,	v0,	cy1	C U1 did it carry
	beq	r7,	$fix7c		C U0 fix exact 0
$ret7c:	addq	r2,	r23,	r2	C U1 carry from last
	addq	u1,	v1,	r5	C U0 add two data
	beq	r2,	$fix0c		C U1 fix exact zero
$ret0c:	cmpult	r5,	v1,	cy0	C U0 did it carry
	addq	r5,	cy1,	r5	C U0 carry from last
	stq	r7,	24(r16)		C L0 store pair
	stq	r2,	32(r16)		C L1

	beq	r5,	$fix1c		C U0 fix exact zero
$ret1c:	stq	r5,	40(r16)		C L0 put an answer
	lda	r16,	48(r16)		C L0 move pointer, undoing the -8 bias

	lda	r19,	8(r19)		C r19 = limbs still to do, 0 to 7
	beq	r19,	$Lret		C none left, return the carry in cy0

	ldq	u1,	0(r17)		C first leftover limb
	ldq	v1,	0(r18)
$Lsmall:
	lda	r19,	-1(r19)		C the last limb is handled at Lend0
	beq	r19,	$Lend0

C Simple loop, one limb per iteration.  On entry u1/v1 hold the limb to add
C and cy0 holds the carry in.
	ALIGN(8)
$Loop0:	addq	u1,	v1,	r2	C main add
	cmpult	r2,	v1,	r8	C compute cy from last add
	ldq	u1,	8(r17)		C load the next limb pair
	ldq	v1,	8(r18)
	addq	r2,	cy0,	r5	C carry add
	lda	r17,	8(r17)		C move source pointers
	lda	r18,	8(r18)
	stq	r5,	0(r16)		C put an answer
	cmpult	r5,	r2,	cy0	C compute cy from last add
	lda	r19,	-1(r19)		C decr loop cnt
	bis	r8,	cy0,	cy0	C combine cy from the two adds
	lda	r16,	8(r16)		C move destination pointer
	bne	r19,	$Loop0
$Lend0:	addq	u1,	v1,	r2	C main add
	addq	r2,	cy0,	r5	C carry add
	cmpult	r2,	v1,	r8	C compute cy from last add
	cmpult	r5,	r2,	cy0	C compute cy from last add
	stq	r5,	0(r16)		C put the final answer
	bis	r8,	cy0,	r0	C combine cy from the two adds
	ret	r31,(r26),1

	ALIGN(8)
$Lret:	lda	r0,	0(cy0)		C copy carry into return register
	ret	r31,(r26),1

C Fix stubs for the exact-zero case.  Each one merges the carry that was just
C added into the carry produced by the same limb, then resumes at the
C instruction after the beq that came here.
$fix5f:	bis	r23,	cy0,	r23	C bring forward carry
	br	r31,	$ret5f
$fix6f:	bis	r22,	r23,	r22	C bring forward carry
	br	r31,	$ret6f
$fix0:	bis	cy1,	r23,	cy1	C bring forward carry
	br	r31,	$ret0
$fix1:	bis	cy0,	cy1,	cy0	C bring forward carry
	br	r31,	$ret1
$fix2:	bis	r22,	cy0,	r22	C bring forward carry
	br	r31,	$ret2
$fix3:	bis	r23,	r22,	r23	C bring forward carry
	br	r31,	$ret3
$fix4:	bis	cy1,	r23,	cy1	C bring forward carry
	br	r31,	$ret4
$fix5:	bis	cy1,	cy0,	cy0	C bring forward carry
	br	r31,	$ret5
$fix6:	bis	r22,	cy0,	r22	C bring forward carry
	br	r31,	$ret6
$fix7:	bis	r23,	r22,	r23	C bring forward carry
	br	r31,	$ret7
$fix0c:	bis	cy1,	r23,	cy1	C bring forward carry
	br	r31,	$ret0c
$fix1c:	bis	cy0,	cy1,	cy0	C bring forward carry
	br	r31,	$ret1c
$fix7c:	bis	r23,	r22,	r23	C bring forward carry
	br	r31,	$ret7c
EPILOGUE()
ASM_END()