beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / alpha / ev6 / sub_n.asm
bloba35ba40d347c8953f1a2ce8f2223f936ebe13d21
1 dnl Alpha ev6 mpn_sub_n -- Subtract two limb vectors of the same length > 0
2 dnl and store difference in a third limb vector.
4 dnl Copyright 2000, 2003, 2005 Free Software Foundation, Inc.
6 dnl This file is part of the GNU MP Library.
7 dnl
8 dnl The GNU MP Library is free software; you can redistribute it and/or modify
9 dnl it under the terms of either:
10 dnl
11 dnl * the GNU Lesser General Public License as published by the Free
12 dnl Software Foundation; either version 3 of the License, or (at your
13 dnl option) any later version.
14 dnl
15 dnl or
16 dnl
17 dnl * the GNU General Public License as published by the Free Software
18 dnl Foundation; either version 2 of the License, or (at your option) any
19 dnl later version.
20 dnl
21 dnl or both in parallel, as here.
22 dnl
23 dnl The GNU MP Library is distributed in the hope that it will be useful, but
24 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
25 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
26 dnl for more details.
27 dnl
28 dnl You should have received copies of the GNU General Public License and the
29 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
30 dnl see https://www.gnu.org/licenses/.
32 include(`../config.m4')
34 C cycles/limb
35 C EV4: ?
36 C EV5: 5.4
37 C EV6: 2.125
39 C INPUT PARAMETERS
40 C rp r16
41 C up r17
42 C vp r18
43 C n r19
44 C cy r20 (for mpn_sub_nc)
46 C TODO
47 C Finish cleaning up cy registers r22, r23 (make them use cy0/cy1)
48 C Use multi-pronged feed-in.
49 C Perform additional micro-tuning
51 C This code was written in cooperation with ev6 pipeline expert Steve Root.
53 C Pair loads and stores where possible
54 C Store pairs oct-aligned where possible (didn't need it here)
55 C Stores are delayed every third cycle
56 C Loads and stores are delayed by fills
57 C U stays still, put code there where possible (note alternation of U1 and U0)
58 C L moves because of loads and stores
59 C Note dampers in L to limit damage
61 C This odd-looking optimization expects that we have random bits in our
62 C data, so that a pure zero result is unlikely.  So we penalize the unlikely
63 C case to help the common case.
C Register aliases used by the code below.
C u0/v0 and u1/v1 are two pairs of limbs, loaded from up (r17) and vp (r18)
C respectively and used alternately.  Note that u0 is r0, the same register
C the return paths write the final borrow to.
65 define(`u0', `r0') define(`u1', `r3')
66 define(`v0', `r1') define(`v1', `r4')
C cy0 is r20, i.e. the borrow-in argument register listed under INPUT
C PARAMETERS; cy1 (r21) is a second borrow temporary.  r22 and r23 are also
C used as borrow temporaries but have no alias yet (see TODO).
68 define(`cy0', `r20') define(`cy1', `r21')
C NOTE(review): MULFUNC_PROLOGUE and ASM_START are defined outside this file;
C presumably the first declares the entry points this file provides and the
C second opens the assembly output -- confirm against the shared m4 macros.
70 MULFUNC_PROLOGUE(mpn_sub_n mpn_sub_nc)
72 ASM_START()
C mpn_sub_nc: subtraction with a borrow-in supplied in cy0 (r20).
C It consists only of a jump to $entry inside mpn_sub_n, which lies just past
C the instruction that clears cy0, so the incoming borrow is left untouched.
73 PROLOGUE(mpn_sub_nc)
74 br r31, $entry C join shared body, cy0 still holds borrow-in
75 EPILOGUE()
C mpn_sub_n: subtract the n-limb (r19) vector at vp (r18) from the one at
C up (r17), store the difference at rp (r16), return the final borrow in r0.
C cy0 is cleared first; mpn_sub_nc joins at $entry with cy0 left as passed in.
C
C Layout of the code below:
C   feed-in  - n >= 5: start the software pipeline on limbs 0..4
C   $Loop    - 8 limbs per pass, run while at least 8 limbs remain beyond the
C              5 that are in flight
C   $Lend    - drain the pipeline: store the 5 limbs still in flight
C   $Lsmall  - one limb per pass; handles n < 5 and whatever $Lend leaves over
C   $fix*    - out-of-line borrow fix-ups, each returning to its $ret* label
C
C Borrow scheme in the pipelined code: for each limb the raw difference
C d = u - v and its borrow b = (u < v) are formed first, and the stored limb is
C d - borrow_in.  With borrow_in being 0 or 1 that second subtraction can only
C borrow when d is exactly 0, so the straight-line path ignores it and the
C "beq d, $fixN" stubs OR borrow_in into b for that case.
76 PROLOGUE(mpn_sub_n)
77 bis r31, r31, cy0 C clear carry in
78 $entry: cmpult r19, 5, r22 C L1 r22 = (n < 5)
79 ldq u1, 0(r17) C L0 get next ones
80 ldq v1, 0(r18) C L1
81 bne r22, $Lsmall C n < 5: use the one-limb-per-pass code only
C Feed-in (n >= 5): finish limbs 0 and 1 into r24/r25, start limb 2 in r7,
C and load limbs 3 and 4 into u0/v0 and u1/v1.
83 ldq u0, 8(r17) C L0 get next ones
84 ldq v0, 8(r18) C L1
85 subq u1, v1, r5 C U0 sub two data
87 cmpult u1, v1, r23 C U0 did it borrow
88 ldq u1, 16(r17) C L0 get next ones
89 ldq v1, 16(r18) C L1
91 subq u0, v0, r8 C U1 sub two data
92 subq r5, cy0, r24 C U0 borrow in
94 cmpult u0, v0, r22 C U1 did it borrow
95 beq r5, $fix5f C U0 fix exact zero
96 $ret5f: ldq u0, 24(r17) C L0 get next ones
97 ldq v0, 24(r18) C L1
99 subq r8, r23, r25 C U1 borrow from last
100 subq u1, v1, r7 C U0 sub two data
102 beq r8, $fix6f C U1 fix exact zero
103 $ret6f: cmpult u1, v1, r23 C U0 did it borrow
104 ldq u1, 32(r17) C L0 get next ones
105 ldq v1, 32(r18) C L1
107 lda r17, 40(r17) C L0 move pointer
108 lda r18, 40(r18) C L1 move pointer
110 lda r16, -8(r16) C bias rp so the pending limb 0 store goes to 8(r16)
111 lda r19, -13(r19) C L1 n -= 13: 5 limbs in flight + 8 for one loop pass
112 blt r19, $Lend C U1 loop control
115 C Main loop. 8-way unrolled.
C On entry to each pass: r24 and r25 hold two finished limbs not yet stored,
C r7 holds the next raw difference with r22/r23 as the borrows around it, and
C u0/v0, u1/v1 hold the two limb pairs after that.
116 ALIGN(16)
117 $Loop: subq u0, v0, r2 C U1 sub two data
118 stq r24, 8(r16) C L0 put an answer
119 subq r7, r22, r24 C U0 borrow from last
120 stq r25, 16(r16) C L1 pair
122 cmpult u0, v0, cy1 C U1 did it borrow
123 beq r7, $fix7 C U0 fix exact 0
124 $ret7: ldq u0, 0(r17) C L0 get next ones
125 ldq v0, 0(r18) C L1
127 bis r31, r31, r31 C L damp out
128 subq r2, r23, r25 C U1 borrow from last
129 bis r31, r31, r31 C L moves in L !
130 subq u1, v1, r5 C U0 sub two data
132 beq r2, $fix0 C U1 fix exact zero
133 $ret0: cmpult u1, v1, cy0 C U0 did it borrow
134 ldq u1, 8(r17) C L0 get next ones
135 ldq v1, 8(r18) C L1
137 subq u0, v0, r8 C U1 sub two data
138 stq r24, 24(r16) C L0 store pair
139 subq r5, cy1, r24 C U0 borrow from last
140 stq r25, 32(r16) C L1
142 cmpult u0, v0, r22 C U1 did it borrow
143 beq r5, $fix1 C U0 fix exact zero
144 $ret1: ldq u0, 16(r17) C L0 get next ones
145 ldq v0, 16(r18) C L1
147 lda r16, 64(r16) C L0 move pointer (later stores use offsets <= 0)
148 subq r8, cy0, r25 C U1 borrow from last
149 lda r19, -8(r19) C L1 move counter
150 subq u1, v1, r7 C U0 sub two data
152 beq r8, $fix2 C U1 fix exact zero
153 $ret2: cmpult u1, v1, r23 C U0 did it borrow
154 ldq u1, 24(r17) C L0 get next ones
155 ldq v1, 24(r18) C L1
157 subq u0, v0, r2 C U1 sub two data
158 stq r24, -24(r16) C L0 put an answer
159 subq r7, r22, r24 C U0 borrow from last
160 stq r25, -16(r16) C L1 pair
162 cmpult u0, v0, cy1 C U1 did it borrow
163 beq r7, $fix3 C U0 fix exact 0
164 $ret3: ldq u0, 32(r17) C L0 get next ones
165 ldq v0, 32(r18) C L1
167 bis r31, r31, r31 C L damp out
168 subq r2, r23, r25 C U1 borrow from last
169 bis r31, r31, r31 C L moves in L !
170 subq u1, v1, r5 C U0 sub two data
172 beq r2, $fix4 C U1 fix exact zero
173 $ret4: cmpult u1, v1, cy0 C U0 did it borrow
174 ldq u1, 40(r17) C L0 get next ones
175 ldq v1, 40(r18) C L1
177 subq u0, v0, r8 C U1 sub two data
178 stq r24, -8(r16) C L0 store pair
179 subq r5, cy1, r24 C U0 borrow from last
180 stq r25, 0(r16) C L1
182 cmpult u0, v0, r22 C U1 did it borrow
183 beq r5, $fix5 C U0 fix exact zero
184 $ret5: ldq u0, 48(r17) C L0 get next ones
185 ldq v0, 48(r18) C L1
187 ldl r31, 256(r17) C L0 prefetch
188 subq r8, cy0, r25 C U1 borrow from last
189 ldl r31, 256(r18) C L1 prefetch
190 subq u1, v1, r7 C U0 sub two data
192 beq r8, $fix6 C U1 fix exact zero
193 $ret6: cmpult u1, v1, r23 C U0 did it borrow
194 ldq u1, 56(r17) C L0 get next ones
195 ldq v1, 56(r18) C L1
197 lda r17, 64(r17) C L0 move pointer
198 bis r31, r31, r31 C U
199 lda r18, 64(r18) C L1 move pointer
200 bge r19, $Loop C U1 loop control
201 C ==== main loop end
C Wind-down: same steps as the start of a loop pass but with no further loads.
C Stores the five limbs still in flight and leaves the running borrow in cy0.
203 $Lend: subq u0, v0, r2 C U1 sub two data
204 stq r24, 8(r16) C L0 put an answer
205 subq r7, r22, r24 C U0 borrow from last
206 stq r25, 16(r16) C L1 pair
207 cmpult u0, v0, cy1 C U1 did it borrow
208 beq r7, $fix7c C U0 fix exact 0
209 $ret7c: subq r2, r23, r25 C U1 borrow from last
210 subq u1, v1, r5 C U0 sub two data
211 beq r2, $fix0c C U1 fix exact zero
212 $ret0c: cmpult u1, v1, cy0 C U0 did it borrow
213 stq r24, 24(r16) C L0 store pair
214 subq r5, cy1, r24 C U0 borrow from last
215 stq r25, 32(r16) C L1
216 beq r5, $fix1c C U0 fix exact zero
217 $ret1c: stq r24, 40(r16) C L0 put an answer
218 lda r16, 48(r16) C L0 move pointer
220 lda r19, 8(r19) C undo the loop bias: r19 = limbs still to do
221 beq r19, $Lret C nothing left, return the borrow held in cy0
223 ldq u1, 0(r17) C preload the next pair, as $Lsmall expects
224 ldq v1, 0(r18)
C Simple code: one limb per pass with the borrow computed in full, so no
C fix-up stubs are needed.  Expects r19 >= 1 limbs to do, the first pair
C already in u1/v1, and the borrow-in in cy0.
225 $Lsmall:
226 lda r19, -1(r19) C the last limb is done at $Lend0, not in the loop
227 beq r19, $Lend0 C exactly one limb to do
229 ALIGN(8)
230 $Loop0: subq u1, v1, r2 C main sub
231 cmpult u1, v1, r8 C compute bw from last sub
232 ldq u1, 8(r17) C fetch the pair for the next pass
233 ldq v1, 8(r18)
234 subq r2, cy0, r5 C borrow sub
235 lda r17, 8(r17) C advance up
236 lda r18, 8(r18) C advance vp
237 stq r5, 0(r16) C store difference limb
238 cmpult r2, cy0, cy0 C compute bw from last sub
239 lda r19, -1(r19) C decr loop cnt
240 bis r8, cy0, cy0 C combine bw from the two subs
241 lda r16, 8(r16) C advance rp
242 bne r19, $Loop0 C more limbs before the last one
243 $Lend0: subq u1, v1, r2 C main sub
244 subq r2, cy0, r5 C borrow sub
245 cmpult u1, v1, r8 C compute bw from last sub
246 cmpult r2, cy0, cy0 C compute bw from last sub
247 stq r5, 0(r16) C store the final limb
248 bis r8, cy0, r0 C combine bw from the two subs
249 ret r31,(r26),1 C return, final borrow in r0
251 ALIGN(8)
252 $Lret: lda r0, 0(cy0) C copy borrow into return register
253 ret r31,(r26),1 C return, final borrow in r0
C Out-of-line fix-ups, reached when a raw difference was exactly zero: OR the
C borrow that was subtracted from it into that limb's own borrow register,
C then resume at the matching $ret label.
255 $fix5f: bis r23, cy0, r23 C bring forward borrow
256 br r31, $ret5f
257 $fix6f: bis r22, r23, r22 C bring forward borrow
258 br r31, $ret6f
259 $fix0: bis cy1, r23, cy1 C bring forward borrow
260 br r31, $ret0
261 $fix1: bis cy0, cy1, cy0 C bring forward borrow
262 br r31, $ret1
263 $fix2: bis r22, cy0, r22 C bring forward borrow
264 br r31, $ret2
265 $fix3: bis r23, r22, r23 C bring forward borrow
266 br r31, $ret3
267 $fix4: bis cy1, r23, cy1 C bring forward borrow
268 br r31, $ret4
269 $fix5: bis cy1, cy0, cy0 C bring forward borrow
270 br r31, $ret5
271 $fix6: bis r22, cy0, r22 C bring forward borrow
272 br r31, $ret6
273 $fix7: bis r23, r22, r23 C bring forward borrow
274 br r31, $ret7
275 $fix0c: bis cy1, r23, cy1 C bring forward borrow
276 br r31, $ret0c
277 $fix1c: bis cy0, cy1, cy0 C bring forward borrow
278 br r31, $ret1c
279 $fix7c: bis r23, r22, r23 C bring forward borrow
280 br r31, $ret7c
282 EPILOGUE()
283 ASM_END()