beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / alpha / ev5 / diveby3.asm
blob3758188e0200545270919fb582ac4ea440a55712
1 dnl Alpha mpn_divexact_by3c -- mpn division by 3, expecting no remainder.
3 dnl Copyright 2004, 2005, 2009 Free Software Foundation, Inc.
5 dnl This file is part of the GNU MP Library.
6 dnl
7 dnl The GNU MP Library is free software; you can redistribute it and/or modify
8 dnl it under the terms of either:
9 dnl
10 dnl * the GNU Lesser General Public License as published by the Free
11 dnl Software Foundation; either version 3 of the License, or (at your
12 dnl option) any later version.
13 dnl
14 dnl or
15 dnl
16 dnl * the GNU General Public License as published by the Free Software
17 dnl Foundation; either version 2 of the License, or (at your option) any
18 dnl later version.
19 dnl
20 dnl or both in parallel, as here.
21 dnl
22 dnl The GNU MP Library is distributed in the hope that it will be useful, but
23 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
25 dnl for more details.
26 dnl
27 dnl You should have received copies of the GNU General Public License and the
28 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
29 dnl see https://www.gnu.org/licenses/.
31 include(`../config.m4')
33 C cycles/limb
34 C EV4: 22
35 C EV5: 11.5
36 C EV6: 6.3 Note that mpn_bdiv_dbm1c is faster
38 C TODO
39 C * Remove the unops, they benefit just ev6, which no longer uses this file.
40 C * Try prefetch for destination, using lds.
41 C * Improve feed-in code, by moving initial mulq earlier; make initial load
42 C to u0/u0 to save some copying.
43 C * Combine u0 and u2, u1 and u3.
45 C INPUT PARAMETERS
C Argument registers.  As used by the routine below: rp is the base for
C the stq stores, up is the base for the ldq loads, n is the limb count
C and cy seeds the carry chain.
46 define(`rp', `r16')
47 define(`up', `r17')
48 define(`n', `r18')
49 define(`cy', `r19')
51 ASM_START()
C Constant table; the routine reads it at offsets 0, 8 and 16 from L(LC).
C   0xAAAAAAAAAAAAAAAB: 3 * this value == 1 (mod 2^64)
C   0x5555555555555555: (2^64-1)/3
C   0xAAAAAAAAAAAAAAAA: 2*(2^64-1)/3
53 DATASTART(L(LC),8)
54 .quad 0xAAAAAAAAAAAAAAAB
55 .quad 0x5555555555555555
56 .quad 0xAAAAAAAAAAAAAAAA
57 DATAEND()
C Registers that hold the three table constants once they are loaded.
59 define(`xAAAAAAAAAAAAAAAB', `r20')
60 define(`x5555555555555555', `r21')
61 define(`xAAAAAAAAAAAAAAAA', `r22')
C Working registers:
C   u0..u3  source limbs in flight (u0 is r0, which the routine also uses
C           for the table address on entry and for the final carry)
C   q0, q1  limb * 0xAAAAAAAAAAAAAAAB products
C   l0      carry-dependent term added to a product to form x
C   x       result limb that gets stored
C   p6, p7  0/1 results of comparing x against the two thresholds
C   t0, t1  masked copies of p6/p7 used to rebuild l0
62 define(`u0', `r0') define(`u1', `r1')
63 define(`u2', `r2') define(`u3', `r3')
64 define(`l0', `r25') define(`x', `r8')
65 define(`q0', `r4') define(`q1', `r5')
66 define(`p6', `r6') define(`p7', `r7')
67 define(`t0', `r23') define(`t1', `r24')
C cymask is r28, the same register the routine uses to hold the first limb
C during feed-in; the limb is copied to a u register before cymask is set.
68 define(`cymask',`r28')
C mpn_divexact_by3c
C
C Walks n limbs starting at 0(up), stores one result limb per source limb
C starting at 0(rp), and leaves the final carry in r0.
C
C For a source limb u and incoming carry c, each stage computes (mod 2^64)
C   x  = u * 0xAAAAAAAAAAAAAAAB + l0      with l0 = c * 0x5555555555555555
C   c' = (u < c) + (x > 0x5555555555555555) + (x > 0xAAAAAAAAAAAAAAAA)
C and stores x.  The initial l0 is non-zero only for cy == 1 or cy == 2.
C
C The loop is unrolled into four stages ($Ltop, $L01, $L00, $L11).  Each
C stage finishes one limb while issuing the mulq for the next limb and the
C ldq for the limb after that.  (n mod 4) selects the entry stage, and the
C feed-in code biases up/rp so the fixed displacements in that stage hit
C the first limb.
C NOTE(review): every path loads 0(up) and stores at least one limb, so
C n >= 1 appears to be assumed -- confirm against callers.
71 PROLOGUE(mpn_divexact_by3c,gp)
73 ldq r28, 0(up) C load first limb early
75 C Put magic constants in registers
76 lda r0, L(LC) C r0 = address of the constant table
77 ldq xAAAAAAAAAAAAAAAB, 0(r0)
78 ldq x5555555555555555, 8(r0)
79 ldq xAAAAAAAAAAAAAAAA, 16(r0)
81 C Compute initial l0 value
82 cmpeq cy, 1, p6 C p6 = (cy == 1)
83 cmpeq cy, 2, p7 C p7 = (cy == 2)
84 negq p6, p6 C p6 = all ones if cy == 1, else 0
85 and p6, x5555555555555555, l0 C l0 = 0x5555... if cy == 1, else 0
86 cmovne p7, xAAAAAAAAAAAAAAAA, l0 C l0 = 0xAAAA... if cy == 2
88 C Feed-in depending on (n mod 4)
89 and n, 3, r8 C r8 = n mod 4
90 lda n, -3(n) C bias n by -3 for the blt/bge tests below
91 cmpeq r8, 1, r4
92 cmpeq r8, 2, r5
93 bne r4, $Lb01 C n mod 4 == 1
94 bne r5, $Lb10 C n mod 4 == 2
95 beq r8, $Lb00 C n mod 4 == 0
C n mod 4 == 3 falls through: enter at $L11, whose store is 24(rp).
97 $Lb11: ldq u3, 8(up) C second limb
98 lda up, -24(up)
99 lda rp, -24(rp)
100 mulq r28, xAAAAAAAAAAAAAAAB, q0 C product for the first limb
101 mov r28, u2 C first limb; frees r28 for use as cymask
102 br r31, $L11
C n mod 4 == 0: enter at $L00, whose store is 16(rp).
104 $Lb00: ldq u2, 8(up) C second limb
105 lda up, -16(up)
106 lda rp, -16(rp)
107 mulq r28, xAAAAAAAAAAAAAAAB, q1 C product for the first limb
108 mov r28, u1 C first limb; frees r28 for use as cymask
109 br r31, $L00
C n mod 4 == 1: enter at $L01 (store 8(rp)), or at $Lcj1 when the biased
C n is negative, i.e. there is only the one limb.
111 $Lb01: lda rp, -8(rp)
112 mulq r28, xAAAAAAAAAAAAAAAB, q0 C product for the first limb
113 mov r28, u0 C first limb; frees r28 for use as cymask
114 blt n, $Lcj1
115 ldq u1, 8(up) C second limb
116 lda up, -8(up)
117 br r31, $L01
C n mod 4 == 2: no pointer bias, fall into $Ltop (store 0(rp)), or go to
C $Lend when the biased n is negative, i.e. only two limbs.
119 $Lb10: ldq u0, 8(up) C second limb
120 mulq r28, xAAAAAAAAAAAAAAAB, q1 C product for the first limb
121 mov r28, u3 C first limb; frees r28 for use as cymask
122 blt n, $Lend
124 ALIGN(16)
C Stage 0: finish limb u3 (product in q1), start u0, load u1.
125 $Ltop:
127 cmpult u3, cy, cy C L0 cy = (u3 < incoming carry)
128 mulq u0, xAAAAAAAAAAAAAAAB, q0 C U1 start product for the next limb
129 ldq u1, 16(up) C L1 load the limb after next
130 addq q1, l0, x C U0 x = result limb
132 negq cy, cymask C L0 all ones if cy == 1, else 0
133 unop C U1
134 unop C L1
135 cmpult x5555555555555555, x, p6 C U0 p6 = (x > 0x5555...)
137 cmpult xAAAAAAAAAAAAAAAA, x, p7 C U1 p7 = (x > 0xAAAA...)
138 unop
139 unop
140 negq p6, t0 C L0
142 negq p7, t1 C L0
143 and cymask, x5555555555555555, l0 C U1 l0 = 0x5555... * cy
144 addq p6, cy, cy C outgoing carry accumulates p6 ...
145 and t0, x5555555555555555, t0
147 and t1, x5555555555555555, t1
148 addq p7, cy, cy C ... and p7
149 unop
150 addq t0, l0, l0 C l0 += 0x5555... * p6
152 addq t1, l0, l0 C l0 += 0x5555... * p7
153 unop
154 stq x, 0(rp) C L1
155 unop
C Stage 1: same sequence for limb u0 (product in q0); start u1, load u2.
156 $L01:
158 cmpult u0, cy, cy C L0
159 mulq u1, xAAAAAAAAAAAAAAAB, q1 C U1
160 ldq u2, 24(up) C L1
161 addq q0, l0, x C U0
163 negq cy, cymask C L0
164 unop C U1
165 unop C L1
166 cmpult x5555555555555555, x, p6 C U0
168 cmpult xAAAAAAAAAAAAAAAA, x, p7 C U1
169 unop
170 unop
171 negq p6, t0 C L0
173 negq p7, t1 C L0
174 and cymask, x5555555555555555, l0 C U1
175 addq p6, cy, cy
176 and t0, x5555555555555555, t0
178 and t1, x5555555555555555, t1
179 addq p7, cy, cy
180 unop
181 addq t0, l0, l0
183 addq t1, l0, l0
184 unop
185 stq x, 8(rp) C L1
186 unop
C Stage 2: same sequence for limb u1 (product in q1); start u2, load u3.
187 $L00:
189 cmpult u1, cy, cy C L0
190 mulq u2, xAAAAAAAAAAAAAAAB, q0 C U1
191 ldq u3, 32(up) C L1
192 addq q1, l0, x C U0
194 negq cy, cymask C L0
195 unop C U1
196 unop C L1
197 cmpult x5555555555555555, x, p6 C U0
199 cmpult xAAAAAAAAAAAAAAAA, x, p7 C U1
200 unop
201 unop
202 negq p6, t0 C L0
204 negq p7, t1 C L0
205 and cymask, x5555555555555555, l0 C U1
206 addq p6, cy, cy
207 and t0, x5555555555555555, t0
209 and t1, x5555555555555555, t1
210 addq p7, cy, cy
211 unop
212 addq t0, l0, l0
214 addq t1, l0, l0
215 unop
216 stq x, 16(rp) C L1
217 unop
C Stage 3: same sequence for limb u2 (product in q0); start u3, load u0.
C Also steps n by -4 and advances up/rp by 32 bytes for the next pass.
218 $L11:
220 cmpult u2, cy, cy C L0
221 mulq u3, xAAAAAAAAAAAAAAAB, q1 C U1
222 ldq u0, 40(up) C L1
223 addq q0, l0, x C U0
225 negq cy, cymask C L0
226 unop C U1
227 unop C L1
228 cmpult x5555555555555555, x, p6 C U0
230 cmpult xAAAAAAAAAAAAAAAA, x, p7 C U1
231 lda n, -4(n) C L1 bookkeeping
232 unop
233 negq p6, t0 C L0
235 negq p7, t1 C L0
236 and cymask, x5555555555555555, l0 C U1
237 addq p6, cy, cy
238 and t0, x5555555555555555, t0
240 and t1, x5555555555555555, t1
241 addq p7, cy, cy
242 unop
243 addq t0, l0, l0
245 addq t1, l0, l0
246 unop
247 stq x, 24(rp) C L1
248 lda up, 32(up)
250 ldl r31, 256(up) C prefetch
251 unop
252 lda rp, 32(rp)
253 bge n, $Ltop C U1 loop while the biased n is still >= 0
254 C *** MAIN LOOP END ***
C Wind-down, second-to-last limb: same sequence as stage 0 but without
C the ldq, since the last limb is already in u0.
255 $Lend:
257 cmpult u3, cy, cy C L0
258 mulq u0, xAAAAAAAAAAAAAAAB, q0 C U1 product for the last limb
259 unop
260 addq q1, l0, x C U0
262 negq cy, cymask C L0
263 unop C U1
264 unop C L1
265 cmpult x5555555555555555, x, p6 C U0
267 cmpult xAAAAAAAAAAAAAAAA, x, p7 C U1
268 unop
269 unop
270 negq p6, t0 C L0
272 negq p7, t1 C L0
273 and cymask, x5555555555555555, l0 C U1
274 addq p6, cy, cy
275 and t0, x5555555555555555, t0
277 and t1, x5555555555555555, t1
278 addq p7, cy, cy
279 unop
280 addq t0, l0, l0
282 addq t1, l0, l0
283 unop
284 stq x, 0(rp) C L1
285 unop
C Last limb: no further l0 is needed, so only the carry is accumulated.
286 $Lcj1:
287 cmpult u0, cy, cy C L0
288 addq q0, l0, x C U0
289 cmpult x5555555555555555, x, p6 C U0
290 cmpult xAAAAAAAAAAAAAAAA, x, p7 C U1
291 addq p6, cy, cy
292 addq p7, cy, r0 C final carry is written to r0
293 stq x, 8(rp) C L1
295 ret r31,(r26),1
296 EPILOGUE()
297 ASM_END()
299 C This is useful for playing with various schedules.
300 C Expand as: one(0)one(1)one(2)one(3)
301 define(`one',`
303 cmpult `$'eval(($1+3)%4), cy, cy C L0
304 mulq `$'$1, xAAAAAAAAAAAAAAAB, `$'eval(4+$1%2) C U1
305 ldq `$'eval(($1+1)%4), eval($1*8+16)(up) C L1
306 addq `$'eval(4+($1+1)%2), l0, x C U0
308 negq cy, cymask C L0
309 unop C U1
310 unop C L1
311 cmpult x5555555555555555, x, p6 C U0
313 cmpult xAAAAAAAAAAAAAAAA, x, p7 C U1
314 unop
315 unop
316 negq p6, t0 C L0
318 negq p7, t1 C L0
319 and cymask, x5555555555555555, l0 C U1
320 addq p6, cy, cy
321 and t0, x5555555555555555, t0
323 and t1, x5555555555555555, t1
324 addq p7, cy, cy
325 unop
326 addq t0, l0, l0
328 addq t1, l0, l0
329 unop
330 stq x, eval($1*8)(rp) C L1
331 unop