beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / ia64 / aors_n.asm
blob7705ce61cc922af9676ae5af3e1774da535cd0a8
1 dnl IA-64 mpn_add_n/mpn_sub_n -- mpn addition and subtraction.
3 dnl Contributed to the GNU project by Torbjorn Granlund.
5 dnl Copyright 2003-2005, 2010, 2011 Free Software Foundation, Inc.
7 dnl This file is part of the GNU MP Library.
8 dnl
9 dnl The GNU MP Library is free software; you can redistribute it and/or modify
10 dnl it under the terms of either:
11 dnl
12 dnl * the GNU Lesser General Public License as published by the Free
13 dnl Software Foundation; either version 3 of the License, or (at your
14 dnl option) any later version.
15 dnl
16 dnl or
17 dnl
18 dnl * the GNU General Public License as published by the Free Software
19 dnl Foundation; either version 2 of the License, or (at your option) any
20 dnl later version.
21 dnl
22 dnl or both in parallel, as here.
23 dnl
24 dnl The GNU MP Library is distributed in the hope that it will be useful, but
25 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
26 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
27 dnl for more details.
28 dnl
29 dnl You should have received copies of the GNU General Public License and the
30 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
31 dnl see https://www.gnu.org/licenses/.
33 include(`../config.m4')
35 C cycles/limb
36 C Itanium: 2.67
37 C Itanium 2: 1.25
39 C TODO
40 C * Consider using special code for small n, using something like
41 C "switch (8 * (n >= 8) + (n mod 8))" to enter it and feed-in code.
42 C * The non-nc code was trimmed cycle for cycle to its current state. It is
43 C probably hard to save more than an odd cycle there. The nc code is much
44 C cruder (since tune/speed doesn't have any applicable direct measurements).
45 C * Without the nc entry points, this becomes around 1800 bytes of object
46 C code; the nc code adds over 1000 bytes. We should perhaps sacrifice a
47 C few cycles for the non-nc code and let it fall into the nc code.
49 C INPUT PARAMETERS
C rp      address the result limbs are stored to (st8 [rp])
C up, vp  addresses the two source operands are loaded from (ld8 [up], ld8 [vp])
C n       limb count; n mod 8 selects the feed-in block, and n-6 shifted right
C         by 3 becomes the loop count in ar.lc
C cy      carry/borrow input, read only by the func_nc entry point
50 define(`rp', `r32')
51 define(`up', `r33')
52 define(`vp', `r34')
53 define(`n', `r35')
54 define(`cy', `r36')
C Operation selection: each group below is guarded by OPERATION_add_n or
C OPERATION_sub_n and supplies the pieces that differ between add and subtract.
C   ADDSUB         mnemonic used for the limb operation
C   CND            condition suffix of the cmp that detects carry/borrow out
C   INCR           value added to a result limb when a carry/borrow comes in
C   LIM            result value at which an incoming carry/borrow propagates
C   LIM2           same test for a result that already had INCR applied
C   func, func_nc  names of the two entry points
C NOTE(review): the embedded numbering skips 64 and 73, so the line closing
C each ifdef appears to be missing from this copy.  Confirm against upstream.
56 ifdef(`OPERATION_add_n',`
57 define(ADDSUB, add)
58 define(CND, ltu)
59 define(INCR, 1)
60 define(LIM, -1)
61 define(LIM2, 0)
62 define(func, mpn_add_n)
63 define(func_nc, mpn_add_nc)
65 ifdef(`OPERATION_sub_n',`
66 define(ADDSUB, sub)
67 define(CND, gtu)
68 define(INCR, -1)
69 define(LIM, 0)
70 define(LIM2, -1)
71 define(func, mpn_sub_n)
72 define(func_nc, mpn_sub_nc)
C PFDIST is the byte offset added to up and vp to form upadv and vpadv, the
C addresses handed to lfetch in the main loop.
75 define(PFDIST, 500)
77 C Some useful aliases for registers we use
C u0-u3 and v0-v3 receive limbs loaded from up and vp; w0-w3 hold the results.
C rpx is a second store pointer used alongside rp in the main loop.
78 define(`u0',`r14') define(`u1',`r15') define(`u2',`r16') define(`u3',`r17')
79 define(`v0',`r24') define(`v1',`r25') define(`v2',`r26') define(`v3',`r27')
80 define(`w0',`r28') define(`w1',`r29') define(`w2',`r30') define(`w3',`r31')
81 define(`rpx',`r3')
82 define(`upadv',`r20') define(`vpadv',`r21')
C Lists the four entry point names this file can provide; func and func_nc
C above expand to one add pair or one sub pair of them.
C NOTE(review): presumably consumed by the build machinery to map function
C names to this file - confirm.
84 MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
86 ASM_START()
C func_nc: entry point that takes a carry/borrow input in cy.
C The entry code loads the first limb pair into r10/r11, saves ar.lc in r2,
C copies cy to r23 and branches on n mod 8 (kept in r14) to one of the
C feed-in blocks .Lc001 - .Lc111, falling through to .Lc000 when n mod 8 = 0.
C NOTE(review): the embedded numbering has gaps (98-100, 107, 111, ...), so
C the stop and bundle-closing lines seem to be missing from this copy.
C Confirm against upstream before assembling.
87 PROLOGUE(func_nc)
88 .prologue
89 .save ar.lc, r2
90 .body
C With HAVE_ABI_32 the pointers are passed through addp4 and n through zxt4.
91 ifdef(`HAVE_ABI_32',`
92 addp4 rp = 0, rp C M I
93 addp4 up = 0, up C M I
94 nop.i 0
95 addp4 vp = 0, vp C M I
96 nop.m 0
97 zxt4 n = n C I
C p15 is set when n > 8 and p14 otherwise.  n is biased by -6 so that the
C feed-in code can derive the loop count with a right shift by 3.
101 {.mmi; ld8 r11 = [vp], 8 C M01
102 ld8 r10 = [up], 8 C M01
103 mov r2 = ar.lc C I0
104 }{.mmi; and r14 = 7, n C M I
105 cmp.lt p15, p14 = 8, n C M I
106 add n = -6, n C M I
108 }{.mmi; add upadv = PFDIST, up C Merging these lines into the feed-in
109 add vpadv = PFDIST, vp C code could save a cycle per call at
110 mov r23 = cy C the expense of code size.
C Dispatch: n mod 8 = 1, 2, 3 first, then 4, 5, 6.
112 }{.mmi; cmp.eq p6, p0 = 1, r14 C M I
113 cmp.eq p7, p0 = 2, r14 C M I
114 cmp.eq p8, p0 = 3, r14 C M I
115 }{.bbb; (p6) br.dptk .Lc001 C B
116 (p7) br.dptk .Lc010 C B
117 (p8) br.dptk .Lc011 C B
119 }{.mmi; cmp.eq p9, p0 = 4, r14 C M I
120 cmp.eq p10, p0 = 5, r14 C M I
121 cmp.eq p11, p0 = 6, r14 C M I
122 }{.bbb; (p9) br.dptk .Lc100 C B
123 (p10) br.dptk .Lc101 C B
124 (p11) br.dptk .Lc110 C B
C Only n mod 8 = 0 or 7 gets here: preload a second limb pair into r18/r19
C and turn the carry input into predicate p13.
126 }{.mmi; ld8 r19 = [vp], 8 C M01
127 ld8 r18 = [up], 8 C M01
128 cmp.ne p13, p0 = 0, cy C copy cy to p13 M I
129 }{.mmb; cmp.eq p12, p0 = 7, r14 C M I
130 nop 0
131 (p12) br.dptk .Lc111 C B
C Feed-in for n mod 8 = 0 (with carry input).  Combines the two preloaded
C limb pairs (r10/r11 and r18/r19) into w1 and w2, folds the carry input in
C under p13, sets ar.lc and rpx = rp + 8, and joins the loop at L(m0).
135 .Lc000:
136 {.mmi; ld8 v3 = [vp], 8 C M01
137 ld8 u3 = [up], 8 C M01
138 shr.u n = n, 3 C I0
140 }{.mmi; add vpadv = PFDIST, vp C M I
141 ld8 v0 = [vp], 8 C M01
142 mov ar.lc = n C I0
143 }{.mmi; ld8 u0 = [up], 8 C M01
144 ADDSUB w1 = r10, r11 C M I
145 nop 0
147 }{.mmi; add upadv = PFDIST, up C M I
148 ld8 v1 = [vp], 8 C M01
149 cmp.CND p7, p0 = w1, r10 C M I
150 }{.mmi; ld8 u1 = [up], 8 C M01
151 ADDSUB w2 = r18, r19 C M I
152 add rpx = 8, rp C M I
154 }{.mmi; ld8 v2 = [vp], 8 C M01
155 cmp.CND p8, p0 = w2, r18 C M I
156 (p13) cmpeqor p7, p0 = LIM, w1 C M I
157 }{.mmi; ld8 u2 = [up], 8 C M01
158 (p13) add w1 = INCR, w1 C M I
159 ADDSUB w3 = u3, v3 C M I
161 }{.mmi; ld8 v3 = [vp], 8 C M01
162 cmp.CND p9, p0 = w3, u3 C M I
163 (p7) cmpeqor p8, p0 = LIM, w2 C M I
164 }{.mmb; ld8 u3 = [up], 8 C M01
165 (p7) add w2 = INCR, w2 C M I
166 br L(m0)
C Feed-in for n mod 8 = 1 (with carry input).  When p15 is clear (n = 1) the
C single limb is finished here, r8 is preset to 0 and control goes to L(cj1).
C Otherwise L(0) loads further limbs, sets ar.lc and rpx = rp + 16, and joins
C the loop at L(c1).  The carry input is taken from r23 into p9.
169 .Lc001:
170 {.mmi; (p15) ld8 v1 = [vp], 8 C M01
171 (p15) ld8 u1 = [up], 8 C M01
172 ADDSUB w0 = r10, r11 C M I
173 }{.mmb; nop 0
174 nop 0
175 (p15) br L(0)
177 }{.mmi; cmp.ne p9, p0 = 0, r23 C M I
178 mov r8 = 0
179 cmp.CND p6, p0 = w0, r10 C M I
181 }{.mmb; (p9) cmpeqor p6, p0 = LIM, w0 C M I
182 (p9) add w0 = INCR, w0 C M I
183 br L(cj1) C B
185 L(0):
186 {.mmi; ld8 v2 = [vp], 8 C M01
187 ld8 u2 = [up], 8 C M01
188 shr.u n = n, 3 C I0
190 }{.mmi; ld8 v3 = [vp], 8 C M01
191 ld8 u3 = [up], 8 C M01
192 mov ar.lc = n C I0
193 }{.mmi; nop 0
194 cmp.ne p9, p0 = 0, r23 C M I
195 nop 0
197 }{.mmi; ld8 v0 = [vp], 8 C M01
198 cmp.CND p6, p0 = w0, r10 C M I
199 add rpx = 16, rp C M I
200 }{.mmb; ld8 u0 = [up], 8 C M01
201 ADDSUB w1 = u1, v1 C M I
202 br L(c1) C B
C Feed-in for n mod 8 = 2 (with carry input in r23, tested into p8).
C Short path (p15 clear, n = 2): w3 gets INCR applied one group before the
C propagate test, which is why that test compares against LIM2 rather than
C LIM; control then goes to L(cj2).
C Long path L(1): loads further limbs, sets ar.lc and rpx = rp + 24, and
C joins the loop at L(m23).
205 .Lc010:
206 {.mmi; ld8 v0 = [vp], 8 C M01
207 ld8 u0 = [up], 8 C M01
208 mov r8 = 0 C M I
209 }{.mmb; ADDSUB w3 = r10, r11 C M I
210 cmp.ne p8, p0 = 0, r23 C M I
211 (p15) br L(1) C B
213 }{.mmi; cmp.CND p9, p0 = w3, r10 C M I
214 ADDSUB w0 = u0, v0 C M I
215 (p8) add w3 = INCR, w3 C M I
217 }{.mmb; cmp.CND p6, p0 = w0, u0 C M I
218 (p8) cmpeqor p9, p0 = LIM2, w3 C M I
219 br L(cj2) C B
221 L(1):
222 {.mmi; ld8 v1 = [vp], 8 C M01
223 ld8 u1 = [up], 8 C M01
224 shr.u n = n, 3 C I0
226 }{.mmi; ld8 v2 = [vp], 8 C M01
227 ld8 u2 = [up], 8 C M01
228 mov ar.lc = n C I0
230 }{.mmi; ld8 v3 = [vp], 8 C M01
231 ld8 u3 = [up], 8 C M01
232 cmp.CND p9, p0 = w3, r10 C M I
234 }{.mmi; (p8) cmpeqor p9, p0 = LIM, w3 C M I
235 (p8) add w3 = INCR, w3 C M I
236 ADDSUB w0 = u0, v0 C M I
237 }{.mmb; add rpx = 24, rp C M I
238 nop 0
239 br L(m23) C B
C Feed-in for n mod 8 = 3 (with carry input in r23, tested into p7).
C Short path (p15 clear, n = 3): finishes w2, computes w3 and goes to L(cj3).
C Long path L(2): stores the first result limb w2 itself, sets ar.lc and
C rpx = rp + 32 (taken before the store advances rp), and joins the loop at
C L(m23).
242 .Lc011:
243 {.mmi; ld8 v3 = [vp], 8 C M01
244 ld8 u3 = [up], 8 C M01
245 shr.u n = n, 3 C I0
246 }{.mmi; ADDSUB w2 = r10, r11 C M I
247 cmp.ne p7, p0 = 0, r23 C M I
248 nop 0
250 }{.mmb; ld8 v0 = [vp], 8 C M01
251 ld8 u0 = [up], 8 C M01
252 (p15) br L(2) C B
253 }{.mmi; cmp.CND p8, p0 = w2, r10 C M I
254 ADDSUB w3 = u3, v3 C M I
255 nop 0
257 }{.mmb; (p7) cmpeqor p8, p0 = LIM, w2 C M I
258 (p7) add w2 = INCR, w2 C M I
259 br L(cj3) C B
261 L(2):
262 {.mmi; ld8 v1 = [vp], 8 C M01
263 ld8 u1 = [up], 8 C M01
264 ADDSUB w3 = u3, v3 C M I
266 }{.mmi; ld8 v2 = [vp], 8 C M01
267 ld8 u2 = [up], 8 C M01
268 cmp.CND p8, p0 = w2, r10 C M I
270 }{.mmi; ld8 v3 = [vp], 8 C M01
271 cmp.CND p9, p0 = w3, u3 C M I
272 mov ar.lc = n C I0
273 }{.mmi; ld8 u3 = [up], 8 C M01
274 (p7) cmpeqor p8, p0 = LIM, w2 C M I
275 (p7) add w2 = INCR, w2 C M I
277 }{.mmi; add rpx = 32, rp C M I
278 st8 [rp] = w2, 8 C M23
279 (p8) cmpeqor p9, p0 = LIM, w3 C M I
280 }{.mmb; (p8) add w3 = INCR, w3 C M I
281 ADDSUB w0 = u0, v0 C M I
282 br L(m23)
C Feed-in for n mod 8 = 4 (with carry input in r23, tested into p6).
C When p14 is set (n = 4) control leaves for L(cj4) before ar.lc is written.
C Otherwise ar.lc is set, more limbs are loaded and the loop is joined at
C L(m4) with rpx = rp + 8.
285 .Lc100:
286 {.mmi; ld8 v2 = [vp], 8 C M01
287 ld8 u2 = [up], 8 C M01
288 shr.u n = n, 3 C I0
289 }{.mmi; ADDSUB w1 = r10, r11 C M I
290 nop 0
291 nop 0
293 }{.mmi; ld8 v3 = [vp], 8 C M01
294 ld8 u3 = [up], 8 C M01
295 add rpx = 8, rp C M I
296 }{.mmi; cmp.ne p6, p0 = 0, r23 C M I
297 cmp.CND p7, p0 = w1, r10 C M I
298 nop 0
300 }{.mmi; ld8 v0 = [vp], 8 C M01
301 ld8 u0 = [up], 8 C M01
302 ADDSUB w2 = u2, v2 C M I
303 }{.mmb; (p6) cmpeqor p7, p0 = LIM, w1 C M I
304 (p6) add w1 = INCR, w1 C M I
305 (p14) br L(cj4)
307 }{.mmi; ld8 v1 = [vp], 8 C M01
308 ld8 u1 = [up], 8 C M01
309 mov ar.lc = n C I0
311 }{.mmi; ld8 v2 = [vp], 8 C M01
312 cmp.CND p8, p0 = w2, u2 C M I
313 nop 0
314 }{.mmi; ld8 u2 = [up], 8 C M01
315 nop 0
316 ADDSUB w3 = u3, v3 C M I
318 }{.mmi; ld8 v3 = [vp], 8 C M01
319 cmp.CND p9, p0 = w3, u3 C M I
320 (p7) cmpeqor p8, p0 = LIM, w2 C M I
321 }{.mmb; ld8 u3 = [up], 8 C M01
322 (p7) add w2 = INCR, w2 C M I
323 br L(m4)
C Feed-in for n mod 8 = 5 (with carry input in r23, tested into p9).
C Five limb pairs are loaded, ar.lc is set and rpx = rp + 16.  With p15 set
C (n > 8) the loop is entered at L(c5); otherwise control goes straight to
C L(end), which also restores ar.lc.
326 .Lc101:
327 {.mmi; ld8 v1 = [vp], 8 C M01
328 ld8 u1 = [up], 8 C M01
329 shr.u n = n, 3 C I0
331 }{.mmi; ld8 v2 = [vp], 8 C M01
332 ld8 u2 = [up], 8 C M01
333 mov ar.lc = n C I0
335 }{.mmi; ld8 v3 = [vp], 8 C M01
336 ld8 u3 = [up], 8 C M01
337 ADDSUB w0 = r10, r11 C M I
338 }{.mmi; cmp.ne p9, p0 = 0, r23 C M I
339 add rpx = 16, rp C M I
340 nop 0
342 }{.mmi; ld8 v0 = [vp], 8 C M01
343 ld8 u0 = [up], 8 C M01
344 cmp.CND p6, p0 = w0, r10 C M I
345 }{.mbb; ADDSUB w1 = u1, v1 C M I
346 (p15) br L(c5) C B
347 br L(end) C B
C Feed-in for n mod 8 = 6 (with carry input in r23, tested into p8).
C Recomputes upadv/vpadv, sets ar.lc and rpx = rp + 24, folds the carry input
C into w3 and joins the loop at L(m67).
350 .Lc110:
351 {.mmi; ld8 v0 = [vp], 8 C M01
352 ld8 u0 = [up], 8 C M01
353 shr.u n = n, 3 C I0
355 }{.mmi; add upadv = PFDIST, up C M I
356 add vpadv = PFDIST, vp C M I
357 mov ar.lc = n C I0
358 }{.mmi; ld8 v1 = [vp], 8 C M01
359 ld8 u1 = [up], 8 C M01
360 ADDSUB w3 = r10, r11 C M I
362 }{.mmi; ld8 v2 = [vp], 8 C M01
363 ld8 u2 = [up], 8 C M01
364 ADDSUB w0 = u0, v0 C M I
365 }{.mmi; cmp.CND p9, p0 = w3, r10 C M I
366 cmp.ne p8, p0 = 0, r23 C M I
367 add rpx = 24, rp C M I
369 }{.mmi; ld8 v3 = [vp], 8 C M01
370 ld8 u3 = [up], 8 C M01
371 nop 0
372 }{.mmb; (p8) cmpeqor p9, p0 = LIM, w3 C M I
373 (p8) add w3 = INCR, w3 C M I
374 br L(m67) C B
C Feed-in for n mod 8 = 7 (with carry input already in p13).
C Uses both preloaded limb pairs (r10/r11 -> w2, r18/r19 -> w3), stores the
C first result limb w2 itself, sets ar.lc and rpx = rp + 32 (taken before the
C store advances rp), and joins the loop at L(m67).
377 .Lc111:
378 {.mmi; ld8 v0 = [vp], 8 C M01
379 ld8 u0 = [up], 8 C M01
380 shr.u n = n, 3 C I0
382 }{.mmi; add upadv = PFDIST, up C M I
383 ld8 v1 = [vp], 8 C M01
384 mov ar.lc = n C I0
385 }{.mmi; ld8 u1 = [up], 8 C M01
386 ADDSUB w2 = r10, r11 C M I
387 nop 0
389 }{.mmi; add vpadv = PFDIST, vp C M I
390 ld8 v2 = [vp], 8 C M01
391 cmp.CND p8, p0 = w2, r10 C M I
392 }{.mmi; ld8 u2 = [up], 8 C M01
393 ADDSUB w3 = r18, r19 C M I
394 nop 0
396 }{.mmi; ld8 v3 = [vp], 8 C M01
397 cmp.CND p9, p0 = w3, r18 C M I
398 (p13) cmpeqor p8, p0 = LIM, w2 C M I
399 }{.mmi; ld8 u3 = [up], 8 C M01
400 (p13) add w2 = INCR, w2 C M I
401 nop 0
403 }{.mmi; add rpx = 32, rp C M I
404 st8 [rp] = w2, 8 C M23
405 (p8) cmpeqor p9, p0 = LIM, w3 C M I
406 }{.mmb; (p8) add w3 = INCR, w3 C M I
407 ADDSUB w0 = u0, v0 C M I
408 br L(m67)
410 EPILOGUE()
C func: entry point without a carry/borrow input.
C Loads the first limb pair into r10/r11, saves ar.lc in r2 and branches on
C n mod 8 (kept in r14) to one of the feed-in blocks .Lb001 - .Lb111, falling
C through to .Lb000 when n mod 8 = 0.  The main loop and the wind-down code
C that follow the feed-in blocks are shared with func_nc.
C NOTE(review): the embedded numbering has gaps (423-425, 432, 439, ...), so
C the stop and bundle-closing lines seem to be missing from this copy.
C Confirm against upstream before assembling.
412 PROLOGUE(func)
413 .prologue
414 .save ar.lc, r2
415 .body
C With HAVE_ABI_32 the pointers are passed through addp4 and n through zxt4.
416 ifdef(`HAVE_ABI_32',`
417 addp4 rp = 0, rp C M I
418 addp4 up = 0, up C M I
419 nop.i 0
420 addp4 vp = 0, vp C M I
421 nop.m 0
422 zxt4 n = n C I
C p15 is set when n > 8 and p14 otherwise.  n is biased by -6 so that the
C feed-in code can derive the loop count with a right shift by 3.
426 {.mmi; ld8 r11 = [vp], 8 C M01
427 ld8 r10 = [up], 8 C M01
428 mov r2 = ar.lc C I0
429 }{.mmi; and r14 = 7, n C M I
430 cmp.lt p15, p14 = 8, n C M I
431 add n = -6, n C M I
C Dispatch: n mod 8 = 1, 2, 3 first, then 4, 5, 6.
433 }{.mmi; cmp.eq p6, p0 = 1, r14 C M I
434 cmp.eq p7, p0 = 2, r14 C M I
435 cmp.eq p8, p0 = 3, r14 C M I
436 }{.bbb; (p6) br.dptk .Lb001 C B
437 (p7) br.dptk .Lb010 C B
438 (p8) br.dptk .Lb011 C B
440 }{.mmi; cmp.eq p9, p0 = 4, r14 C M I
441 cmp.eq p10, p0 = 5, r14 C M I
442 cmp.eq p11, p0 = 6, r14 C M I
443 }{.bbb; (p9) br.dptk .Lb100 C B
444 (p10) br.dptk .Lb101 C B
445 (p11) br.dptk .Lb110 C B
C Only n mod 8 = 0 or 7 gets here: preload a second limb pair into r18/r19,
C clear p13 and zero r23.
447 }{.mmi; ld8 r19 = [vp], 8 C M01
448 ld8 r18 = [up], 8 C M01
449 cmp.ne p13, p0 = r0, r0 C clear "CF" M I
450 }{.mmb; cmp.eq p12, p0 = 7, r14 C M I
451 mov r23 = 0 C M I
452 (p12) br.dptk .Lb111 C B
C Feed-in for n mod 8 = 0 (no carry input).  Combines the two preloaded limb
C pairs (r10/r11 and r18/r19) into w1 and w2, sets ar.lc, rpx = rp + 8 and the
C prefetch addresses, and joins the loop at L(m0).
456 .Lb000:
457 {.mmi; ld8 v3 = [vp], 8 C M01
458 ld8 u3 = [up], 8 C M01
459 shr.u n = n, 3 C I0
461 }{.mmi; ld8 v0 = [vp], 8 C M01
462 ld8 u0 = [up], 8 C M01
463 ADDSUB w1 = r10, r11 C M I
465 }{.mmi; ld8 v1 = [vp], 8 C M01
466 cmp.CND p7, p0 = w1, r10 C M I
467 mov ar.lc = n C I0
468 }{.mmi; ld8 u1 = [up], 8 C M01
469 ADDSUB w2 = r18, r19 C M I
470 add rpx = 8, rp C M I
472 }{.mmi; add upadv = PFDIST, up
473 add vpadv = PFDIST, vp
474 cmp.CND p8, p0 = w2, r18 C M I
475 }{.mmi; ld8 v2 = [vp], 8 C M01
476 ld8 u2 = [up], 8 C M01
477 ADDSUB w3 = u3, v3 C M I
479 }{.mmi; ld8 v3 = [vp], 8 C M01
480 cmp.CND p9, p0 = w3, u3 C M I
481 (p7) cmpeqor p8, p0 = LIM, w2 C M I
482 }{.mmb; ld8 u3 = [up], 8 C M01
483 (p7) add w2 = INCR, w2 C M I
484 br L(m0) C B
C Feed-in for n mod 8 = 1 (no carry input).  With p14 set (n = 1) the single
C limb is finished via L(cj1) with r8 preset to 0.  Otherwise more limbs are
C loaded, ar.lc, rpx = rp + 16 and the prefetch addresses are set, and the
C loop is joined at L(m1).
487 ALIGN(32)
488 .Lb001:
489 {.mmi; ADDSUB w0 = r10, r11 C M I
490 (p15) ld8 v1 = [vp], 8 C M01
491 mov r8 = 0 C M I
493 }{.mmb; cmp.CND p6, p0 = w0, r10 C M I
494 (p15) ld8 u1 = [up], 8 C M01
495 (p14) br L(cj1) C B
497 }{.mmi; add upadv = PFDIST, up
498 add vpadv = PFDIST, vp
499 shr.u n = n, 3 C I0
500 }{.mmi; ld8 v2 = [vp], 8 C M01
501 ld8 u2 = [up], 8 C M01
502 cmp.CND p6, p0 = w0, r10 C M I
504 }{.mmi; ld8 v3 = [vp], 8 C M01
505 ld8 u3 = [up], 8 C M01
506 mov ar.lc = n C I0
508 }{.mmi; ld8 v0 = [vp], 8 C M01
509 ld8 u0 = [up], 8 C M01
510 ADDSUB w1 = u1, v1 C M I
512 }{.mmi; ld8 v1 = [vp], 8 C M01
513 cmp.CND p7, p0 = w1, u1 C M I
514 ADDSUB w2 = u2, v2 C M I
515 }{.mmb; ld8 u1 = [up], 8 C M01
516 add rpx = 16, rp C M I
517 br L(m1) C B
C Feed-in for n mod 8 = 2 (no carry input).  Short path (p15 clear, n = 2):
C computes w3 and w0 with their carry predicates p9 and p6, presets r8 to 0
C and goes to L(cj2).  Long path L(gt2): loads further limbs, sets ar.lc,
C rpx = rp + 24 and the prefetch addresses, and joins the loop at L(m23).
520 ALIGN(32)
521 .Lb010:
522 {.mmi; ld8 v0 = [vp], 8 C M01
523 ld8 u0 = [up], 8 C M01
524 shr.u n = n, 3 C I0
525 }{.mmb; ADDSUB w3 = r10, r11 C M I
526 nop 0
527 (p15) br L(gt2) C B
529 }{.mmi; cmp.CND p9, p0 = w3, r10 C M I
530 ADDSUB w0 = u0, v0 C M I
531 mov r8 = 0 C M I
533 }{.mmb; nop 0
534 cmp.CND p6, p0 = w0, u0 C M I
535 br L(cj2) C B
537 L(gt2):
538 {.mmi; ld8 v1 = [vp], 8 C M01
539 ld8 u1 = [up], 8 C M01
540 nop 0
542 }{.mmi; add upadv = PFDIST, up
543 add vpadv = PFDIST, vp
544 mov ar.lc = n C I0
545 }{.mmi; ld8 v2 = [vp], 8 C M01
546 ld8 u2 = [up], 8 C M01
547 nop 0
549 }{.mmi; ld8 v3 = [vp], 8 C M01
550 cmp.CND p9, p0 = w3, r10 C M I
551 ADDSUB w0 = u0, v0 C M I
552 }{.mmb; ld8 u3 = [up], 8 C M01
553 add rpx = 24, rp C M I
554 br L(m23) C B
C Feed-in for n mod 8 = 3 (no carry input).  Short path (p15 clear, n = 3):
C computes w2 and w3 and goes to L(cj3).  Long path L(3): stores the first
C result limb w2 itself, sets ar.lc, rpx = rp + 32 (taken before the store
C advances rp) and the prefetch addresses, and joins the loop at L(m23).
557 ALIGN(32)
558 .Lb011:
559 {.mmi; ld8 v3 = [vp], 8 C M01
560 ld8 u3 = [up], 8 C M01
561 ADDSUB w2 = r10, r11 C M I
563 }{.mmb; ld8 v0 = [vp], 8 C M01
564 ld8 u0 = [up], 8 C M01
565 (p15) br L(3) C B
566 }{.mmb; cmp.CND p8, p0 = w2, r10 C M I
567 ADDSUB w3 = u3, v3 C M I
568 br L(cj3) C B
570 L(3):
571 {.mmi; ld8 v1 = [vp], 8 C M01
572 ld8 u1 = [up], 8 C M01
573 shr.u n = n, 3 C I0
575 }{.mmi; add upadv = PFDIST, up
576 add vpadv = PFDIST, vp
577 ADDSUB w3 = u3, v3 C M I
578 }{.mmi; ld8 v2 = [vp], 8 C M01
579 ld8 u2 = [up], 8 C M01
580 cmp.CND p8, p0 = w2, r10 C M I
582 }{.mmi; ld8 v3 = [vp], 8 C M01
583 cmp.CND p9, p0 = w3, u3 C M I
584 mov ar.lc = n C I0
585 }{.mmi; ld8 u3 = [up], 8 C M01
586 nop 0
587 nop 0
589 }{.mmi; add rpx = 32, rp C M I
590 st8 [rp] = w2, 8 C M23
591 (p8) cmpeqor p9, p0 = LIM, w3 C M I
592 }{.mmb; (p8) add w3 = INCR, w3 C M I
593 ADDSUB w0 = u0, v0 C M I
594 br L(m23) C B
C Feed-in for n mod 8 = 4 (no carry input).  With p14 set (n = 4) control
C leaves for L(cj4) before ar.lc is written.  Otherwise L(gt4) sets ar.lc,
C rpx = rp + 8 and the prefetch addresses, loads more limbs and joins the
C loop at L(m4).
597 ALIGN(32)
598 .Lb100:
599 {.mmi; ld8 v2 = [vp], 8 C M01
600 ld8 u2 = [up], 8 C M01
601 shr.u n = n, 3 C I0
603 }{.mmi; ld8 v3 = [vp], 8 C M01
604 ld8 u3 = [up], 8 C M01
605 ADDSUB w1 = r10, r11 C M I
607 }{.mmi; ld8 v0 = [vp], 8 C M01
608 ld8 u0 = [up], 8 C M01
609 cmp.CND p7, p0 = w1, r10 C M I
610 }{.mmb; nop 0
611 ADDSUB w2 = u2, v2 C M I
612 (p14) br L(cj4) C B
615 L(gt4):
616 {.mmi; add upadv = PFDIST, up
617 add vpadv = PFDIST, vp
618 mov ar.lc = n C I0
619 }{.mmi; ld8 v1 = [vp], 8 C M01
620 ld8 u1 = [up], 8 C M01
621 nop 0
623 }{.mmi; ld8 v2 = [vp], 8 C M01
624 cmp.CND p8, p0 = w2, u2 C M I
625 nop 0
626 }{.mmi; ld8 u2 = [up], 8 C M01
627 ADDSUB w3 = u3, v3 C M I
628 add rpx = 8, rp C M I
630 }{.mmi; ld8 v3 = [vp], 8 C M01
631 cmp.CND p9, p0 = w3, u3 C M I
632 (p7) cmpeqor p8, p0 = LIM, w2 C M I
633 }{.mmb; ld8 u3 = [up], 8 C M01
634 (p7) add w2 = INCR, w2 C M I
635 br L(m4) C B
C Feed-in for n mod 8 = 5 (no carry input).  Loads five limb pairs and sets
C rpx = rp + 16 and the prefetch addresses.  With p14 set (n = 5) control goes
C to L(cj5) before ar.lc is written; otherwise L(gt5) sets ar.lc and joins the
C loop at L(m5).
638 ALIGN(32)
639 .Lb101:
640 {.mmi; ld8 v1 = [vp], 8 C M01
641 ld8 u1 = [up], 8 C M01
642 shr.u n = n, 3 C I0
644 }{.mmi; ld8 v2 = [vp], 8 C M01
645 ld8 u2 = [up], 8 C M01
646 ADDSUB w0 = r10, r11 C M I
648 }{.mmi; add upadv = PFDIST, up
649 add vpadv = PFDIST, vp
650 add rpx = 16, rp C M I
651 }{.mmi; ld8 v3 = [vp], 8 C M01
652 ld8 u3 = [up], 8 C M01
653 nop 0
655 }{.mmi; ld8 v0 = [vp], 8 C M01
656 cmp.CND p6, p0 = w0, r10 C M I
657 nop 0
658 }{.mmb; ld8 u0 = [up], 8 C M01
659 ADDSUB w1 = u1, v1 C M I
660 (p14) br L(cj5) C B
663 L(gt5):
664 {.mmi; ld8 v1 = [vp], 8 C M01
665 cmp.CND p7, p0 = w1, u1 C M I
666 mov ar.lc = n C I0
667 }{.mmb; ld8 u1 = [up], 8 C M01
668 ADDSUB w2 = u2, v2 C M I
669 br L(m5) C B
C Feed-in for n mod 8 = 6 (no carry input).  Sets ar.lc, rpx = rp + 24 and the
C prefetch addresses, computes w3 and w0 and joins the loop at L(m67).
672 ALIGN(32)
673 .Lb110:
674 {.mmi; ld8 v0 = [vp], 8 C M01
675 ld8 u0 = [up], 8 C M01
676 shr.u n = n, 3 C I0
678 }{.mmi; ld8 v1 = [vp], 8 C M01
679 ld8 u1 = [up], 8 C M01
680 ADDSUB w3 = r10, r11 C M I
682 }{.mmi; add upadv = PFDIST, up
683 add vpadv = PFDIST, vp
684 mov ar.lc = n C I0
685 }{.mmi; ld8 v2 = [vp], 8 C M01
686 ld8 u2 = [up], 8 C M01
687 nop 0
689 }{.mmi; ld8 v3 = [vp], 8 C M01
690 cmp.CND p9, p0 = w3, r10 C M I
691 ADDSUB w0 = u0, v0 C M I
692 }{.mmb; ld8 u3 = [up], 8 C M01
693 add rpx = 24, rp C M I
694 br L(m67) C B
C Feed-in for n mod 8 = 7 (no carry input).  Uses both preloaded limb pairs
C (r10/r11 -> w2, r18/r19 -> w3), stores the first result limb w2 itself,
C sets ar.lc, rpx = rp + 32 (taken before the store advances rp) and the
C prefetch addresses, and joins the loop at L(m67).
697 ALIGN(32)
698 .Lb111:
699 {.mmi; ld8 v0 = [vp], 8 C M01
700 ld8 u0 = [up], 8 C M01
701 shr.u n = n, 3 C I0
703 }{.mmi; ld8 v1 = [vp], 8 C M01
704 ld8 u1 = [up], 8 C M01
705 ADDSUB w2 = r10, r11 C M I
707 }{.mmi; ld8 v2 = [vp], 8 C M01
708 cmp.CND p8, p0 = w2, r10 C M I
709 mov ar.lc = n C I0
710 }{.mmi; ld8 u2 = [up], 8 C M01
711 ADDSUB w3 = r18, r19 C M I
712 nop 0
714 }{.mmi; add upadv = PFDIST, up
715 add vpadv = PFDIST, vp
716 nop 0
717 }{.mmi; ld8 v3 = [vp], 8 C M01
718 ld8 u3 = [up], 8 C M01
719 cmp.CND p9, p0 = w3, r18 C M I
721 }{.mmi; add rpx = 32, rp C M I
722 st8 [rp] = w2, 8 C M23
723 (p8) cmpeqor p9, p0 = LIM, w3 C M I
724 }{.mmb; (p8) add w3 = INCR, w3 C M I
725 ADDSUB w0 = u0, v0 C M I
726 br L(m67) C B
729 C *** MAIN LOOP START ***
C One pass from L(top) to the br.cloop stores eight result limbs.
C Each step follows the same pattern on the rotating names u0-u3, v0-v3,
C w0-w3 and predicates p6-p9:
C   ld8                load the next limb pair
C   ADDSUB w = u, v    raw result limb
C   cmp.CND p = w, u   carry/borrow out of the raw operation
C   (q) cmpeqor p ...  also carry out when a carry came in (q) and w equals LIM
C   (q) add w = INCR   apply the incoming carry to the result limb
C   st8                store a finished limb through rp or rpx
C L(c5), L(m5), L(m4), L(m23), L(c1), L(m1), L(m0) and L(m67) are the points
C where the feed-in blocks join.  lfetch touches upadv and vpadv once each per
C pass, advancing them by 64.  br.cloop repeats while ar.lc allows.
730 ALIGN(32)
731 L(top):
732 L(c5): ld8 v1 = [vp], 8 C M01
733 cmp.CND p7, p0 = w1, u1 C M I
734 (p9) cmpeqor p6, p0 = LIM, w0 C M I
735 ld8 u1 = [up], 8 C M01
736 (p9) add w0 = INCR, w0 C M I
737 ADDSUB w2 = u2, v2 C M I
739 L(m5): ld8 v2 = [vp], 8 C M01
740 cmp.CND p8, p0 = w2, u2 C M I
741 (p6) cmpeqor p7, p0 = LIM, w1 C M I
742 ld8 u2 = [up], 8 C M01
743 (p6) add w1 = INCR, w1 C M I
744 ADDSUB w3 = u3, v3 C M I
746 st8 [rp] = w0, 8 C M23
747 ld8 v3 = [vp], 8 C M01
748 cmp.CND p9, p0 = w3, u3 C M I
749 (p7) cmpeqor p8, p0 = LIM, w2 C M I
750 ld8 u3 = [up], 8 C M01
751 (p7) add w2 = INCR, w2 C M I
C rp and rpx store two adjacent limbs here; rp then skips the limb rpx wrote.
753 L(m4): st8 [rp] = w1, 16 C M23
754 st8 [rpx] = w2, 32 C M23
755 (p8) cmpeqor p9, p0 = LIM, w3 C M I
756 lfetch [upadv], 64
757 (p8) add w3 = INCR, w3 C M I
758 ADDSUB w0 = u0, v0 C M I
760 L(m23): st8 [rp] = w3, 8 C M23
761 ld8 v0 = [vp], 8 C M01
762 cmp.CND p6, p0 = w0, u0 C M I
763 ld8 u0 = [up], 8 C M01
764 ADDSUB w1 = u1, v1 C M I
765 nop.b 0
767 L(c1): ld8 v1 = [vp], 8 C M01
768 cmp.CND p7, p0 = w1, u1 C M I
769 (p9) cmpeqor p6, p0 = LIM, w0 C M I
770 ld8 u1 = [up], 8 C M01
771 (p9) add w0 = INCR, w0 C M I
772 ADDSUB w2 = u2, v2 C M I
774 L(m1): ld8 v2 = [vp], 8 C M01
775 cmp.CND p8, p0 = w2, u2 C M I
776 (p6) cmpeqor p7, p0 = LIM, w1 C M I
777 ld8 u2 = [up], 8 C M01
778 (p6) add w1 = INCR, w1 C M I
779 ADDSUB w3 = u3, v3 C M I
781 st8 [rp] = w0, 8 C M23
782 ld8 v3 = [vp], 8 C M01
783 cmp.CND p9, p0 = w3, u3 C M I
784 (p7) cmpeqor p8, p0 = LIM, w2 C M I
785 ld8 u3 = [up], 8 C M01
786 (p7) add w2 = INCR, w2 C M I
788 L(m0): st8 [rp] = w1, 16 C M23
789 st8 [rpx] = w2, 32 C M23
790 (p8) cmpeqor p9, p0 = LIM, w3 C M I
791 lfetch [vpadv], 64
792 (p8) add w3 = INCR, w3 C M I
793 ADDSUB w0 = u0, v0 C M I
795 L(m67): st8 [rp] = w3, 8 C M23
796 ld8 v0 = [vp], 8 C M01
797 cmp.CND p6, p0 = w0, u0 C M I
798 ld8 u0 = [up], 8 C M01
799 ADDSUB w1 = u1, v1 C M I
800 br.cloop.dptk L(top) C B
802 C *** MAIN LOOP END ***
C Wind-down: finishes the limbs that are already in registers, with no
C further loads.  L(end) is reached by falling out of the loop (and from
C .Lc101) and restores ar.lc from r2.  L(cjN) is the point from which N result
C limbs remain to be stored; the short-n feed-in paths branch to these labels
C directly.  The final carry/borrow is returned in r8: 0, or 1 when p6 is set.
804 L(end):
805 {.mmi; (p9) cmpeqor p6, p0 = LIM, w0 C M I
806 (p9) add w0 = INCR, w0 C M I
807 mov ar.lc = r2 C I0
809 L(cj5):
810 {.mmi; cmp.CND p7, p0 = w1, u1 C M I
811 ADDSUB w2 = u2, v2 C M I
812 nop 0
814 }{.mmi; st8 [rp] = w0, 8 C M23
815 (p6) cmpeqor p7, p0 = LIM, w1 C M I
816 (p6) add w1 = INCR, w1 C M I
818 L(cj4):
819 {.mmi; cmp.CND p8, p0 = w2, u2 C M I
820 ADDSUB w3 = u3, v3 C M I
821 nop 0
823 }{.mmi; st8 [rp] = w1, 8 C M23
824 (p7) cmpeqor p8, p0 = LIM, w2 C M I
825 (p7) add w2 = INCR, w2 C M I
827 L(cj3):
828 {.mmi; cmp.CND p9, p0 = w3, u3 C M I
829 ADDSUB w0 = u0, v0 C M I
830 nop 0
832 }{.mmi; st8 [rp] = w2, 8 C M23
833 (p8) cmpeqor p9, p0 = LIM, w3 C M I
834 (p8) add w3 = INCR, w3 C M I
C r8 is cleared here; paths entering at L(cj2) or L(cj1) clear it themselves.
835 }{.mmi; cmp.CND p6, p0 = w0, u0 C M I
836 nop 0
837 mov r8 = 0 C M I
840 L(cj2):
841 {.mmi; st8 [rp] = w3, 8 C M23
842 (p9) cmpeqor p6, p0 = LIM, w0 C M I
843 (p9) add w0 = INCR, w0 C M I
846 L(cj1):
847 {.mmb; st8 [rp] = w0, 8 C M23
848 (p6) mov r8 = 1 C M I
849 br.ret.sptk.many b0 C B
851 EPILOGUE()
852 ASM_END()