1 dnl IA-64 mpn_rsh1add_n/mpn_rsh1sub_n -- rp[] = (up[] +- vp[]) >> 1.
3 dnl Contributed to the GNU project by Torbjorn Granlund.
5 dnl Copyright 2003-2005 Free Software Foundation, Inc.
7 dnl This file is part of the GNU MP Library.
9 dnl The GNU MP Library is free software; you can redistribute it and/or modify
10 dnl it under the terms of either:
12 dnl * the GNU Lesser General Public License as published by the Free
13 dnl Software Foundation; either version 3 of the License, or (at your
14 dnl option) any later version.
18 dnl * the GNU General Public License as published by the Free Software
19 dnl Foundation; either version 2 of the License, or (at your option) any
22 dnl or both in parallel, as here.
24 dnl The GNU MP Library is distributed in the hope that it will be useful, but
25 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
26 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
29 dnl You should have received copies of the GNU General Public License and the
30 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
31 dnl see https://www.gnu.org/licenses/.
C Pull in the shared m4 configuration.  The quoted path must stay on one
C line: a newline inside the quotes would become part of the file name.
33 include(`../config.m4')
40 C * Rewrite function entry code using aorslsh1_n.asm style.
41 C * Micro-optimize feed-in and wind-down code.
C Per-variant macros, selected by whichever OPERATION_* symbol the build
C defines.  The code below uses all of them:
C   ADDSUB  the limb operation, add or sub
C   PRED    unsigned compare of a raw result against its first operand that
C           detects wrap-around: a wrapped sum is below the operand (ltu),
C           a wrapped difference is above it (gtu)
C   INCR    adjustment applied to a limb for an incoming carry or borrow
C   LIM     the one limb value that wraps when INCR is applied, so that the
C           carry or borrow keeps propagating (all ones for add, zero for sub)
C   func    name of the function being assembled
C Each quoted body is closed before the next conditional starts.
49 ifdef(`OPERATION_rsh1add_n',`
  define(ADDSUB, add)
  define(PRED, ltu)
  define(INCR, 1)
  define(LIM, -1)
54 define(func, mpn_rsh1add_n)
')
56 ifdef(`OPERATION_rsh1sub_n',`
  define(ADDSUB, sub)
  define(PRED, gtu)
  define(INCR, -1)
  define(LIM, 0)
61 define(func, mpn_rsh1sub_n)
')
64 C Some useful aliases for registers we use
C   u0-u3  limbs loaded from up
C   v0-v3  limbs loaded from vp
C   w0-w3  raw ADDSUB results, before the one-bit shift
C   x0-x3  shifted result limbs that get stored to rp
C Every definition is kept on a single line so that no newline ends up
C inside a quoted register name.
65 define(`u0',`r14') define(`u1',`r15') define(`u2',`r16') define(`u3',`r17')
66 define(`v0',`r18') define(`v1',`r19') define(`v2',`r20') define(`v3',`r21')
67 define(`w0',`r22') define(`w1',`r23') define(`w2',`r24') define(`w3',`r25')
68 define(`x0',`r26') define(`x1',`r9') define(`x2',`r30') define(`x3',`r31')
C Function entry: widen the pointers, preload the first limb pair and
C dispatch on n mod 4 to one of four feed-in sequences.
C NOTE(review): rp, up, vp and n are not defined in the visible text, and
C neither a function prologue nor a save of ar.lc is visible before this
C point -- confirm against the complete file.
70 MULFUNC_PROLOGUE(mpn_rsh1add_n mpn_rsh1sub_n)
C addp4 turns each 32-bit pointer into its 64-bit form.  Presumably this is
C guarded by a 32-bit ABI conditional that is not visible here -- TODO confirm.
78 addp4 rp
= 0, rp C M I
79 addp4 up
= 0, up C M I
80 addp4 vp
= 0, vp C M I
C Preload limb 0 of both operands into r11 (from vp) and r10 (from up); each
C load post-increments its pointer by 8 bytes.
86 {.mmi; ld8 r11 = [vp], 8 C M01
87 ld8 r10 = [up], 8 C M01
C r14 = n mod 4.  r14 is also the u0 alias; it is free as scratch here
C because u0 has not been loaded yet.
89 }{.mmi; and r14 = 3, n C M I
C p15 = (4 < n).  The feed-in code uses it to branch to its .grtN variants.
90 cmp.lt p15, p0 = 4, n C M I
C At most one of p6, p7, p8 is set, for n mod 4 = 1, 2, 3 respectively.
93 }{.mmi; cmp.eq p6, p0 = 1, r14 C M I
94 cmp.eq p7, p0 = 2, r14 C M I
95 cmp.eq p8, p0 = 3, r14 C M I
C Dispatch.  When n mod 4 = 0 no branch is taken and control falls into .Lb00.
97 (p6) br.dptk .Lb01 C B
98 (p7) br.dptk .Lb10 C B
99 (p8) br.dptk .Lb11 C B
C .Lb00: feed-in for n mod 4 = 0.  The preloaded limb pair (r10, r11) takes
C the w3 slot; the next three pairs are loaded into slots 0, 1 and 2.
C ADDSUB, PRED, LIM and INCR are macros whose definitions are not visible in
C this text.  From their use they appear to be the limb operation, its
C carry/borrow test, the limb value that lets an incoming carry propagate,
C and the carry adjustment -- TODO confirm against the definitions.
102 .Lb00: ld8 v0 = [vp], 8 C M01
103 ld8 u0 = [up], 8 C M01
106 ld8 v1 = [vp], 8 C M01
107 ld8 u1 = [up], 8 C M01
108 ADDSUB w3 = r10, r11 C M I
110 ld8 v2 = [vp], 8 C M01
111 ld8 u2 = [up], 8 C M01
C p15 set (4 < n): use the variant that keeps loading further limbs.
112 (p15) br.dpnt .grt4 C B
C Short path, no further loads.  For each slot, cmp.PRED compares the raw
C result with its first operand and sets the carry predicate for that slot.
115 cmp.PRED p7, p0 = w3, r10 C M I
117 ADDSUB w0 = u0, v0 C M I
119 cmp.PRED p8, p0 = w0, u0 C M I
120 ADDSUB w1 = u1, v1 C M I
122 cmp.PRED p9, p0 = w1, u1 C M I
C Fold the carry from w3 into w0: if w0 equals LIM the carry also goes on to
C the next limb (p8 is ORed, never cleared), then w0 is adjusted by INCR.
123 (p7) cmp.eq.or p8, p0 = LIM, w0 C M I
124 (p7) add w0 = INCR, w0 C M I
C x3 = w3 shifted right one bit with bit 0 of w0 entering at bit 63.
126 shrp x3 = w0, w3, 1 C I0
127 ADDSUB w2 = u2, v2 C M I
C Same carry fold, from w0 into w1.
128 (p8) cmp.eq.or p9, p0 = LIM, w1 C M I
129 (p8) add w1 = INCR, w1 C M I
C NOTE(review): no branch is visible here, so as shown this path would run
C into .grt4.  Presumably a jump into the wind-down code belongs here.
C .grt4: the same steps as the short path, interleaved with loads that
C refill u0/v0, u1/v1 and v2 and fetch u3/v3 for the following limbs.
132 .grt4: ld8 v3 = [vp], 8 C M01
133 cmp.PRED p7, p0 = w3, r10 C M I
134 ld8 u3 = [up], 8 C M01
C Each register is reloaded only after the ADDSUB that consumed its old value.
137 ADDSUB w0 = u0, v0 C M I
138 ld8 v0 = [vp], 8 C M01
141 cmp.PRED p8, p0 = w0, u0 C M I
142 ld8 u0 = [up], 8 C M01
143 ADDSUB w1 = u1, v1 C M I
145 ld8 v1 = [vp], 8 C M01
147 cmp.PRED p9, p0 = w1, u1 C M I
148 ld8 u1 = [up], 8 C M01
149 (p7) cmp.eq.or p8, p0 = LIM, w0 C M I
150 (p7) add w0 = INCR, w0 C M I
152 ADDSUB w2 = u2, v2 C M I
153 ld8 v2 = [vp], 8 C M01
154 shrp x3 = w0, w3, 1 C I0
155 (p8) cmp.eq.or p9, p0 = LIM, w1 C M I
156 (p8) add w1 = INCR, w1 C M I
C NOTE(review): the end of this path (loop-count setup and the jump into the
C main loop) is not visible in this text.
C .Lb01: feed-in for n mod 4 = 1.  The preloaded limb pair (r10, r11) takes
C the w2 slot.
C ADDSUB, PRED, LIM and INCR are macros whose definitions are not visible in
C this text.  From their use they appear to be the limb operation, its
C carry/borrow test, the limb value that lets an incoming carry propagate,
C and the carry adjustment -- TODO confirm against the definitions.
160 .Lb01: ADDSUB w2 = r10, r11 C M I
C p15 set (4 < n): go load more limbs.
162 (p15) br.dpnt .grt1 C B
C Short path.  p6 = carry out of w2, p7 = its complement.
165 cmp.PRED p6, p7 = w2, r10 C M I
166 shr.u x2 = w2, 1 C I0
C When p6 is set, deposit a 1 into bit 63 of x2, so the carry becomes the
C top bit of the shifted limb.
169 (p6) dep x2 = -1, x2, 63, 1 C I0
C NOTE(review): no branch is visible here, so as shown this path would run
C into .grt1.  Presumably a jump to the final store belongs here.
C .grt1: long path, load the next four limb pairs.
172 .grt1: ld8 v3 = [vp], 8 C M01
173 ld8 u3 = [up], 8 C M01
175 ld8 v0 = [vp], 8 C M01
176 ld8 u0 = [up], 8 C M01
C Loop counter taken from n.  NOTE(review): any earlier adjustment of n is
C not visible in this text.
177 mov.i ar.lc = n C FIXME swap with next I0
179 ld8 v1 = [vp], 8 C M01
180 ld8 u1 = [up], 8 C M01
182 ld8 v2 = [vp], 8 C M01
183 ld8 u2 = [up], 8 C M01
184 cmp.PRED p6, p0 = w2, r10 C M I
186 ADDSUB w3 = u3, v3 C M I
C br.cloop branches, decrementing ar.lc, while ar.lc is non-zero.
187 br.cloop.dptk .grt5 C B
C ar.lc was zero: finish with the limbs already loaded, no further loads.
190 cmp.PRED p7, p0 = w3, u3 C M I
192 ADDSUB w0 = u0, v0 C M I
C Fold the carry from w2 into w3: if w3 equals LIM the carry goes on to the
C next limb (p7 is ORed, never cleared), then w3 is adjusted by INCR.
193 (p6) cmp.eq.or p7, p0 = LIM, w3 C M I
194 (p6) add w3 = INCR, w3 C M I
196 cmp.PRED p8, p0 = w0, u0 C M I
C x2 = w2 shifted right one bit with bit 0 of w3 entering at bit 63.
197 shrp x2 = w3, w2, 1 C I0
198 ADDSUB w1 = u1, v1 C M I
200 cmp.PRED p9, p0 = w1, u1 C M I
201 (p7) cmp.eq.or p8, p0 = LIM, w0 C M I
202 (p7) add w0 = INCR, w0 C M I
C NOTE(review): no branch is visible here; presumably control continues in
C the wind-down code rather than at .grt5.
C .grt5: the same steps, interleaved with loads for the following limbs.
205 .grt5: ld8 v3 = [vp], 8 C M01
206 cmp.PRED p7, p0 = w3, u3 C M I
207 ld8 u3 = [up], 8 C M01
209 ADDSUB w0 = u0, v0 C M I
210 ld8 v0 = [vp], 8 C M01
211 (p6) cmp.eq.or p7, p0 = LIM, w3 C M I
212 (p6) add w3 = INCR, w3 C M I
214 cmp.PRED p8, p0 = w0, u0 C M I
215 shrp x2 = w3, w2, 1 C I0
216 ld8 u0 = [up], 8 C M01
217 ADDSUB w1 = u1, v1 C M I
219 ld8 v1 = [vp], 8 C M01
220 cmp.PRED p9, p0 = w1, u1 C M I
221 ld8 u1 = [up], 8 C M01
222 (p7) cmp.eq.or p8, p0 = LIM, w0 C M I
223 (p7) add w0 = INCR, w0 C M I
C NOTE(review): the jump from here into the main loop is not visible.
C .Lb10: feed-in for n mod 4 = 2.  The preloaded limb pair (r10, r11) takes
C the w1 slot and the next pair is loaded into slot 2.
C ADDSUB, PRED, LIM and INCR are macros whose definitions are not visible in
C this text.  From their use they appear to be the limb operation, its
C carry/borrow test, the limb value that lets an incoming carry propagate,
C and the carry adjustment -- TODO confirm against the definitions.
227 .Lb10: ld8 v2 = [vp], 8 C M01
228 ld8 u2 = [up], 8 C M01
230 ADDSUB w1 = r10, r11 C M I
C p15 set (4 < n): go load more limbs.
231 (p15) br.dpnt .grt2 C B
C Short path.  p9 = carry out of w1.
234 cmp.PRED p9, p0 = w1, r10 C M I
236 ADDSUB w2 = u2, v2 C M I
238 cmp.PRED p6, p0 = w2, u2 C M I
C Fold the carry from w1 into w2: if w2 equals LIM the carry goes on out of
C w2 as well (p6 is ORed, never cleared), then w2 is adjusted by INCR.
240 (p9) cmp.eq.or p6, p0 = LIM, w2 C M I
241 (p9) add w2 = INCR, w2 C M I
C x1 = w1 shifted right one bit with bit 0 of w2 entering at bit 63.
C x2 = w2 shifted right one bit; the carry in p6 is not merged in here.
243 shrp x1 = w2, w1, 1 C I0
244 shr.u x2 = w2, 1 C I0
C NOTE(review): no branch is visible here, so as shown this path would run
C into .grt2.  Presumably a jump into the wind-down code belongs here.
C .grt2: long path, load further limb pairs while the first results form.
247 .grt2: ld8 v3 = [vp], 8 C M01
248 ld8 u3 = [up], 8 C M01
250 ld8 v0 = [vp], 8 C M01
251 ld8 u0 = [up], 8 C M01
254 ld8 v1 = [vp], 8 C M01
255 cmp.PRED p9, p0 = w1, r10 C M I
256 ld8 u1 = [up], 8 C M01
C u2/v2 are reloaded only after the ADDSUB that consumed their old values.
259 ADDSUB w2 = u2, v2 C M I
260 ld8 v2 = [vp], 8 C M01
262 cmp.PRED p6, p0 = w2, u2 C M I
263 ld8 u2 = [up], 8 C M01
264 ADDSUB w3 = u3, v3 C M I
C br.cloop branches, decrementing ar.lc, while ar.lc is non-zero.
C NOTE(review): no write to ar.lc is visible on this path before the branch.
265 br.cloop.dptk .grt6 C B
C ar.lc was zero: finish with the limbs already loaded, no further loads.
268 cmp.PRED p7, p0 = w3, u3 C M I
269 (p9) cmp.eq.or p6, p0 = LIM, w2 C M I
270 (p9) add w2 = INCR, w2 C M I
272 shrp x1 = w2, w1, 1 C I0
273 ADDSUB w0 = u0, v0 C M I
274 (p6) cmp.eq.or p7, p0 = LIM, w3 C M I
275 (p6) add w3 = INCR, w3 C M I
C NOTE(review): no branch is visible here; presumably control continues in
C the wind-down code rather than at .grt6.
C .grt6: the same steps, interleaved with loads for the following limbs.
278 .grt6: ld8 v3 = [vp], 8 C M01
279 cmp.PRED p7, p0 = w3, u3 C M I
280 ld8 u3 = [up], 8 C M01
281 (p9) cmp.eq.or p6, p0 = LIM, w2 C M I
282 (p9) add w2 = INCR, w2 C M I
284 shrp x1 = w2, w1, 1 C I0
285 ADDSUB w0 = u0, v0 C M I
286 ld8 v0 = [vp], 8 C M01
287 (p6) cmp.eq.or p7, p0 = LIM, w3 C M I
288 (p6) add w3 = INCR, w3 C M I
C NOTE(review): the jump from here into the main loop is not visible.
C .Lb11: feed-in for n mod 4 = 3.  The preloaded limb pair (r10, r11) takes
C the w0 slot and the next two pairs are loaded into slots 1 and 2.
C ADDSUB, PRED, LIM and INCR are macros whose definitions are not visible in
C this text.  From their use they appear to be the limb operation, its
C carry/borrow test, the limb value that lets an incoming carry propagate,
C and the carry adjustment -- TODO confirm against the definitions.
292 .Lb11: ld8 v1 = [vp], 8 C M01
293 ld8 u1 = [up], 8 C M01
296 ld8 v2 = [vp], 8 C M01
297 ld8 u2 = [up], 8 C M01
298 ADDSUB w0 = r10, r11 C M I
C p15 set (4 < n): go load more limbs.
299 (p15) br.dpnt .grt3 C B
C Short path.  p8 = carry out of w0.
302 cmp.PRED p8, p0 = w0, r10 C M I
303 ADDSUB w1 = u1, v1 C M I
306 cmp.PRED p9, p0 = w1, u1 C M I
308 ADDSUB w2 = u2, v2 C M I
C Fold the carry from w0 into w1: if w1 equals LIM the carry goes on to the
C next limb (p9 is ORed, never cleared), then w1 is adjusted by INCR.
309 (p8) cmp.eq.or p9, p0 = LIM, w1 C M I
310 (p8) add w1 = INCR, w1 C M I
312 cmp.PRED p6, p0 = w2, u2 C M I
C x0 = w0 shifted right one bit with bit 0 of w1 entering at bit 63.
313 shrp x0 = w1, w0, 1 C I0
C Same carry fold, from w1 into w2.
315 (p9) cmp.eq.or p6, p0 = LIM, w2 C M I
316 (p9) add w2 = INCR, w2 C M I
C NOTE(review): no branch is visible here, so as shown this path would run
C into .grt3.  Presumably a jump into the wind-down code belongs here.
C .grt3: long path, load further limb pairs while the first results form.
319 .grt3: ld8 v3 = [vp], 8 C M01
320 ld8 u3 = [up], 8 C M01
322 ld8 v0 = [vp], 8 C M01
324 cmp.PRED p8, p0 = w0, r10 C M I
325 ld8 u0 = [up], 8 C M01
C u1/v1 and u2/v2 are reloaded only after the ADDSUB that consumed them.
326 ADDSUB w1 = u1, v1 C M I
329 ld8 v1 = [vp], 8 C M01
330 cmp.PRED p9, p0 = w1, u1 C M I
331 ld8 u1 = [up], 8 C M01
333 ADDSUB w2 = u2, v2 C M I
334 ld8 v2 = [vp], 8 C M01
335 (p8) cmp.eq.or p9, p0 = LIM, w1 C M I
336 (p8) add w1 = INCR, w1 C M I
338 cmp.PRED p6, p0 = w2, u2 C M I
339 shrp x0 = w1, w0, 1 C I0
340 ld8 u2 = [up], 8 C M01
341 ADDSUB w3 = u3, v3 C M I
C br.cloop branches, decrementing ar.lc, while ar.lc is non-zero.
C NOTE(review): no write to ar.lc is visible on this path before the branch.
342 br.cloop.dptk .grt7 C B
C ar.lc was zero: finish with the limbs already loaded, no further loads.
345 cmp.PRED p7, p0 = w3, u3 C M I
346 (p9) cmp.eq.or p6, p0 = LIM, w2 C M I
347 (p9) add w2 = INCR, w2 C M I
C NOTE(review): no branch is visible here; presumably control continues in
C the wind-down code rather than at .grt7.
C .grt7: the same steps, interleaved with the loads of the next u3/v3.
350 .grt7: ld8 v3 = [vp], 8 C M01
351 cmp.PRED p7, p0 = w3, u3 C M I
352 ld8 u3 = [up], 8 C M01
353 (p9) cmp.eq.or p6, p0 = LIM, w2 C M I
354 (p9) add w2 = INCR, w2 C M I
C NOTE(review): the jump from here into the main loop is not visible.
358 C *** MAIN LOOP START ***
C Four limbs per trip, in four interleaved stages.  Within the loop:
C   st8        stores a finished result limb (x3, x0, x1, x2 in turn) and
C              post-increments rp by 8
C   ld8        refills an operand register after its old value was consumed
C   ADDSUB     forms the raw result w for one limb
C   cmp.PRED   sets the carry predicate of that limb:
C              p7 <- w3, p8 <- w0, p9 <- w1, p6 <- w2
C   cmp.eq.or  under the previous limb predicate, ORs the carry onward when
C              the raw result equals LIM
C   add INCR   under the same predicate, applies the incoming carry
C   shrp       builds an output limb: the lower w shifted right one bit with
C              bit 0 of the next w entering at bit 63
C ADDSUB, PRED, LIM and INCR are macros whose definitions are not visible in
C this text; the roles above are read off their use -- TODO confirm.
C .LL11, .LL10, .LL01 and .LL00 are labels inside the loop body; the
C branches that target them are not visible in this text.
360 .Loop: st8 [rp] = x3, 8 C M23
361 ld8 v3 = [vp], 8 C M01
362 cmp.PRED p7, p0 = w3, u3 C M I
363 ld8 u3 = [up], 8 C M01
364 (p9) cmp.eq.or p6, p0 = LIM, w2 C M I
365 (p9) add w2 = INCR, w2 C M I
367 .LL11: st8 [rp] = x0, 8 C M23
368 shrp x1 = w2, w1, 1 C I0
369 ADDSUB w0 = u0, v0 C M I
370 ld8 v0 = [vp], 8 C M01
371 (p6) cmp.eq.or p7, p0 = LIM, w3 C M I
372 (p6) add w3 = INCR, w3 C M I
374 .LL10: cmp.PRED p8, p0 = w0, u0 C M I
375 shrp x2 = w3, w2, 1 C I0
377 ld8 u0 = [up], 8 C M01
378 ADDSUB w1 = u1, v1 C M I
381 st8 [rp] = x1, 8 C M23
382 ld8 v1 = [vp], 8 C M01
383 cmp.PRED p9, p0 = w1, u1 C M I
384 ld8 u1 = [up], 8 C M01
385 (p7) cmp.eq.or p8, p0 = LIM, w0 C M I
386 (p7) add w0 = INCR, w0 C M I
388 .LL01: st8 [rp] = x2, 8 C M23
389 shrp x3 = w0, w3, 1 C I0
390 ADDSUB w2 = u2, v2 C M I
391 ld8 v2 = [vp], 8 C M01
392 (p8) cmp.eq.or p9, p0 = LIM, w1 C M I
393 (p8) add w1 = INCR, w1 C M I
395 .LL00: cmp.PRED p6, p0 = w2, u2 C M I
396 shrp x0 = w1, w0, 1 C I0
398 ld8 u2 = [up], 8 C M01
399 ADDSUB w3 = u3, v3 C M I
C Repeat, decrementing ar.lc, while ar.lc is non-zero; otherwise fall
C through to .Lskip.
400 br.cloop.dptk .Loop C B
402 C *** MAIN LOOP END ***
C Wind-down: the same stage sequence as the main loop with the loads
C removed, draining the limbs that are already in registers.
C .Lcj7 down to .Lcj1 are labels along this sequence; the branches that
C target them are not visible in this text (presumably the short feed-in
C paths enter here with the matching number of limbs left -- TODO confirm).
C ADDSUB, PRED, LIM and INCR are macros whose definitions are not visible in
C this text.  From their use they appear to be the limb operation, its
C carry/borrow test, the limb value that lets an incoming carry propagate,
C and the carry adjustment.
404 .Lskip: st8 [rp] = x3, 8 C M23
405 cmp.PRED p7, p0 = w3, u3 C M I
406 (p9) cmp.eq.or p6, p0 = LIM, w2 C M I
407 (p9) add w2 = INCR, w2 C M I
409 .Lcj7: st8 [rp] = x0, 8 C M23
410 shrp x1 = w2, w1, 1 C I0
411 ADDSUB w0 = u0, v0 C M I
412 (p6) cmp.eq.or p7, p0 = LIM, w3 C M I
413 (p6) add w3 = INCR, w3 C M I
415 .Lcj6: cmp.PRED p8, p0 = w0, u0 C M I
416 shrp x2 = w3, w2, 1 C I0
417 ADDSUB w1 = u1, v1 C M I
419 st8 [rp] = x1, 8 C M23
420 cmp.PRED p9, p0 = w1, u1 C M I
421 (p7) cmp.eq.or p8, p0 = LIM, w0 C M I
422 (p7) add w0 = INCR, w0 C M I
424 .Lcj5: st8 [rp] = x2, 8 C M23
425 shrp x3 = w0, w3, 1 C I0
426 ADDSUB w2 = u2, v2 C M I
427 (p8) cmp.eq.or p9, p0 = LIM, w1 C M I
428 (p8) add w1 = INCR, w1 C M I
430 .Lcj4: cmp.PRED p6, p0 = w2, u2 C M I
431 shrp x0 = w1, w0, 1 C I0
433 st8 [rp] = x3, 8 C M23
434 (p9) cmp.eq.or p6, p0 = LIM, w2 C M I
435 (p9) add w2 = INCR, w2 C M I
437 .Lcj3: st8 [rp] = x0, 8 C M23
438 shrp x1 = w2, w1, 1 C I0
C Most significant limb: there is no higher w to shift in, so shift w2 alone.
439 shr.u x2 = w2, 1 C I0
441 .Lcj2: st8 [rp] = x1, 8 C M23
C When p6 is set, deposit a 1 into bit 63 of x2, so the final carry becomes
C the top bit of the result.
442 (p6) dep x2 = -1, x2, 63, 1 C I0
C Last store; rp is not post-incremented here.
444 .Lcj1: st8 [rp] = x2 C M23
C Reload ar.lc from r2.  NOTE(review): the matching save of ar.lc into r2
C is not visible in this text.
445 mov.i ar.lc = r2 C I0
446 br.ret.sptk.many b0 C B