1 dnl IA-64 mpn_add_n/mpn_sub_n -- mpn addition and subtraction.
3 dnl Contributed to the GNU project by Torbjorn Granlund.
5 dnl Copyright 2003-2005, 2010, 2011 Free Software Foundation, Inc.
7 dnl This file is part of the GNU MP Library.
9 dnl The GNU MP Library is free software; you can redistribute it and/or modify
10 dnl it under the terms of either:
12 dnl * the GNU Lesser General Public License as published by the Free
13 dnl Software Foundation; either version 3 of the License, or (at your
14 dnl option) any later version.
18 dnl * the GNU General Public License as published by the Free Software
19 dnl Foundation; either version 2 of the License, or (at your option) any
dnl later version.
22 dnl or both in parallel, as here.
24 dnl The GNU MP Library is distributed in the hope that it will be useful, but
25 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
26 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
dnl for more details.
29 dnl You should have received copies of the GNU General Public License and the
30 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
31 dnl see https://www.gnu.org/licenses/.
33 include(`..
/config.m4
')
40 C * Consider using special code for small n, using something like
41 C "switch (8 * (n >= 8) + (n mod 8))" to enter it and feed-in code.
42 C * The non-nc code was trimmed cycle for cycle to its current state. It is
43 C probably hard to save more than an odd cycle there. The nc code is much
44 C cruder (since tune/speed doesn't have any applicable direct measurements).
45 C * Without the nc entry points, this becomes around 1800 bytes of object
46 C code; the nc code adds over 1000 bytes. We should perhaps sacrifice a
47 C few cycles for the non-nc code and let it fall into the nc code.
C Build-time configuration (m4).
C   OPERATION_add_n selects func = mpn_add_n, func_nc = mpn_add_nc
C   OPERATION_sub_n selects func = mpn_sub_n, func_nc = mpn_sub_nc
C Register aliases set up here: u0-u3 = r14-r17, v0-v3 = r24-r27,
C w0-w3 = r28-r31, upadv = r20, vpadv = r21.
C MULFUNC_PROLOGUE lists the four entry points this file can provide.
C NOTE(review): ADDSUB, CND, INCR, LIM, LIM2, cmpeqor, PFDIST, rpx and the
C parameter names rp/up/vp/n/cy are used by the code further down, but their
C definitions (and the closing quotes of the two ifdef bodies) are not visible
C in this extract -- confirm against the complete file.
56 ifdef
(`OPERATION_add_n
',`
62 define(func, mpn_add_n)
63 define(func_nc, mpn_add_nc)
65 ifdef
(`OPERATION_sub_n
',`
71 define(func, mpn_sub_n)
72 define(func_nc, mpn_sub_nc)
77 C Some useful aliases for registers we use
78 define
(`u0
',`r14') define
(`u1
',`r15') define
(`u2
',`r16') define
(`u3
',`r17')
79 define
(`v0
',`r24') define
(`v1
',`r25') define
(`v2
',`r26') define
(`v3
',`r27')
80 define
(`w0
',`r28') define
(`w1
',`r29') define
(`w2
',`r30') define
(`w3
',`r31')
82 define
(`upadv
',`r20') define
(`vpadv
',`r21')
84 MULFUNC_PROLOGUE
(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc
)
C Entry with carry-in (presumably func_nc; the PROLOGUE line is not visible
C in this extract).  Loads the first limb pair, saves cy, and dispatches on
C n mod 8 to one of the .Lc* feed-in paths.
C NOTE(review): the addp4 pointer fix-ups are presumably wrapped in an
C HAVE_ABI_32 ifdef as at the other entry point -- the guard is not visible.
92 addp4 rp = 0, rp C M I
93 addp4 up = 0, up C M I
95 addp4 vp = 0, vp C M I
C r10/r11 = first limb of up/vp; both pointers post-increment by 8.
101 {.mmi; ld8 r11 = [vp], 8 C M01
102 ld8 r10 = [up], 8 C M01
C r14 = n mod 8 selects the feed-in; p15 = (8 < n), p14 = its complement.
104 }{.mmi; and r14 = 7, n C M I
105 cmp.lt p15, p14 = 8, n C M I
C r23 keeps a copy of the incoming carry for the feed-in paths.
108 }{.mmi; add upadv = PFDIST, up C Merging these lines into the feed-in
109 add vpadv = PFDIST, vp C code could save a cycle per call at
110 mov r23 = cy C the expense of code size.
C Residues 1, 2, 3 branch away first.
112 }{.mmi; cmp.eq p6, p0 = 1, r14 C M I
113 cmp.eq p7, p0 = 2, r14 C M I
114 cmp.eq p8, p0 = 3, r14 C M I
115 }{.bbb; (p6) br.dptk .Lc001 C B
116 (p7) br.dptk .Lc010 C B
117 (p8) br.dptk .Lc011 C B
C Then residues 4, 5, 6.
119 }{.mmi; cmp.eq p9, p0 = 4, r14 C M I
120 cmp.eq p10, p0 = 5, r14 C M I
121 cmp.eq p11, p0 = 6, r14 C M I
122 }{.bbb; (p9) br.dptk .Lc100 C B
123 (p10) br.dptk .Lc101 C B
124 (p11) br.dptk .Lc110 C B
C Residues 7 and 0 both use a second limb pair (r18/r19) and take the
C carry-in from p13; residue 7 branches, residue 0 falls through.
126 }{.mmi; ld8 r19 = [vp], 8 C M01
127 ld8 r18 = [up], 8 C M01
128 cmp.ne p13, p0 = 0, cy C copy cy to p13 M I
129 }{.mmb; cmp.eq p12, p0 = 7, r14 C M I
131 (p12) br.dptk .Lc111 C B
C Carry-in feed-in reached by falling through the dispatch (n mod 8 == 0);
C presumably labelled .Lc000 -- the label line is not visible here.
C Computes w1, w2, w3 from (r10,r11), (r18,r19), (u3,v3) while preloading
C further limb pairs.  Carry chain: p13 (carry-in) -> p7 -> p8, with p9
C holding the raw carry test of w3.  rpx is set to rp + 8.
C Per-limb idiom: cmp.CND derives the carry from the raw result, the
C predicated cmpeqor ORs in the case where applying the incoming carry wraps
C (result equals LIM), and the predicated add applies INCR.
C NOTE(review): stops, loop-count setup and the final branch into the main
C loop appear to be missing from this extract.
136 {.mmi; ld8 v3 = [vp], 8 C M01
137 ld8 u3 = [up], 8 C M01
140 }{.mmi; add vpadv = PFDIST, vp C M I
141 ld8 v0 = [vp], 8 C M01
143 }{.mmi; ld8 u0 = [up], 8 C M01
144 ADDSUB w1 = r10, r11 C M I
147 }{.mmi; add upadv = PFDIST, up C M I
148 ld8 v1 = [vp], 8 C M01
149 cmp.CND p7, p0 = w1, r10 C M I
150 }{.mmi; ld8 u1 = [up], 8 C M01
151 ADDSUB w2 = r18, r19 C M I
152 add rpx = 8, rp C M I
154 }{.mmi; ld8 v2 = [vp], 8 C M01
155 cmp.CND p8, p0 = w2, r18 C M I
156 (p13) cmpeqor p7, p0 = LIM, w1 C M I
157 }{.mmi; ld8 u2 = [up], 8 C M01
158 (p13) add w1 = INCR, w1 C M I
159 ADDSUB w3 = u3, v3 C M I
161 }{.mmi; ld8 v3 = [vp], 8 C M01
162 cmp.CND p9, p0 = w3, u3 C M I
163 (p7) cmpeqor p8, p0 = LIM, w2 C M I
164 }{.mmb; ld8 u3 = [up], 8 C M01
165 (p7) add w2 = INCR, w2 C M I
C Carry-in feed-in for n mod 8 == 1 (presumably the .Lc001 target; the label
C line is not visible here).
C w0 = first limb result; the u1/v1 loads only happen when p15 (8 < n) is set.
C p9 = (saved carry-in r23 != 0); p6 = carry out of w0 including the carry-in.
170 {.mmi; (p15) ld8 v1 = [vp], 8 C M01
171 (p15) ld8 u1 = [up], 8 C M01
172 ADDSUB w0 = r10, r11 C M I
177 }{.mmi; cmp.ne p9, p0 = 0, r23 C M I
179 cmp.CND p6, p0 = w0, r10 C M I
181 }{.mmb; (p9) cmpeqor p6, p0 = LIM, w0 C M I
182 (p9) add w0 = INCR, w0 C M I
C Second stretch: preloads three more limb pairs, recomputes p9/p6 and starts
C w1; rpx = rp + 16.  NOTE(review): presumably the longer-operand continuation
C selected by p15 -- the branches and local label separating the two stretches
C are not visible in this extract.
186 {.mmi; ld8 v2 = [vp], 8 C M01
187 ld8 u2 = [up], 8 C M01
190 }{.mmi; ld8 v3 = [vp], 8 C M01
191 ld8 u3 = [up], 8 C M01
194 cmp.ne p9, p0 = 0, r23 C M I
197 }{.mmi; ld8 v0 = [vp], 8 C M01
198 cmp.CND p6, p0 = w0, r10 C M I
199 add rpx = 16, rp C M I
200 }{.mmb; ld8 u0 = [up], 8 C M01
201 ADDSUB w1 = u1, v1 C M I
C Carry-in feed-in for n mod 8 == 2 (presumably the .Lc010 target; the label
C line is not visible here).
C w3 = first limb result, w0 = second limb result; p8 = (r23 != 0) is the
C carry-in, p9 the carry out of w3, p6 the raw carry test of w0.
206 {.mmi; ld8 v0 = [vp], 8 C M01
207 ld8 u0 = [up], 8 C M01
209 }{.mmb; ADDSUB w3 = r10, r11 C M I
210 cmp.ne p8, p0 = 0, r23 C M I
213 }{.mmi; cmp.CND p9, p0 = w3, r10 C M I
214 ADDSUB w0 = u0, v0 C M I
215 (p8) add w3 = INCR, w3 C M I
C In this stretch the carry is applied to w3 before the equality test, so the
C test compares against LIM2 rather than LIM -- presumably LIM2 is the value
C w3 holds after wrapping; its definition is not visible here.
217 }{.mmb; cmp.CND p6, p0 = w0, u0 C M I
218 (p8) cmpeqor p9, p0 = LIM2, w3 C M I
C Second stretch: preloads three more limb pairs and redoes the w3 carry
C handling in the usual test-then-add order; rpx = rp + 24.
C NOTE(review): the branches and local label separating the two stretches are
C not visible in this extract.
222 {.mmi; ld8 v1 = [vp], 8 C M01
223 ld8 u1 = [up], 8 C M01
226 }{.mmi; ld8 v2 = [vp], 8 C M01
227 ld8 u2 = [up], 8 C M01
230 }{.mmi; ld8 v3 = [vp], 8 C M01
231 ld8 u3 = [up], 8 C M01
232 cmp.CND p9, p0 = w3, r10 C M I
234 }{.mmi; (p8) cmpeqor p9, p0 = LIM, w3 C M I
235 (p8) add w3 = INCR, w3 C M I
236 ADDSUB w0 = u0, v0 C M I
237 }{.mmb; add rpx = 24, rp C M I
C Carry-in feed-in for n mod 8 == 3 (presumably the .Lc011 target; the label
C line is not visible here).
C w2 = first limb result, w3 = second; p7 = (r23 != 0) is the carry-in and
C p8 the carry out of w2.
243 {.mmi; ld8 v3 = [vp], 8 C M01
244 ld8 u3 = [up], 8 C M01
246 }{.mmi; ADDSUB w2 = r10, r11 C M I
247 cmp.ne p7, p0 = 0, r23 C M I
250 }{.mmb; ld8 v0 = [vp], 8 C M01
251 ld8 u0 = [up], 8 C M01
253 }{.mmi; cmp.CND p8, p0 = w2, r10 C M I
254 ADDSUB w3 = u3, v3 C M I
257 }{.mmb; (p7) cmpeqor p8, p0 = LIM, w2 C M I
258 (p7) add w2 = INCR, w2 C M I
C Second stretch: preloads further limb pairs, repeats the w2/w3 computation,
C stores w2 as the first result limb, propagates p8 into w3 and starts w0;
C rpx = rp + 32.  NOTE(review): the branches and local label separating the
C two stretches are not visible in this extract.
262 {.mmi; ld8 v1 = [vp], 8 C M01
263 ld8 u1 = [up], 8 C M01
264 ADDSUB w3 = u3, v3 C M I
266 }{.mmi; ld8 v2 = [vp], 8 C M01
267 ld8 u2 = [up], 8 C M01
268 cmp.CND p8, p0 = w2, r10 C M I
270 }{.mmi; ld8 v3 = [vp], 8 C M01
271 cmp.CND p9, p0 = w3, u3 C M I
273 }{.mmi; ld8 u3 = [up], 8 C M01
274 (p7) cmpeqor p8, p0 = LIM, w2 C M I
275 (p7) add w2 = INCR, w2 C M I
277 }{.mmi; add rpx = 32, rp C M I
278 st8 [rp] = w2, 8 C M23
279 (p8) cmpeqor p9, p0 = LIM, w3 C M I
280 }{.mmb; (p8) add w3 = INCR, w3 C M I
281 ADDSUB w0 = u0, v0 C M I
C Carry-in feed-in for n mod 8 == 4 (presumably the .Lc100 target; the label
C line is not visible here).
C w1 = first limb result with carry-in p6 = (r23 != 0); p7 = carry out of w1,
C propagated into w2; p8/p9 are the carry tests of w2/w3.  rpx = rp + 8.
C NOTE(review): stops, loop-count setup and branches appear to be missing
C from this extract.
286 {.mmi; ld8 v2 = [vp], 8 C M01
287 ld8 u2 = [up], 8 C M01
289 }{.mmi; ADDSUB w1 = r10, r11 C M I
293 }{.mmi; ld8 v3 = [vp], 8 C M01
294 ld8 u3 = [up], 8 C M01
295 add rpx = 8, rp C M I
296 }{.mmi; cmp.ne p6, p0 = 0, r23 C M I
297 cmp.CND p7, p0 = w1, r10 C M I
300 }{.mmi; ld8 v0 = [vp], 8 C M01
301 ld8 u0 = [up], 8 C M01
302 ADDSUB w2 = u2, v2 C M I
303 }{.mmb; (p6) cmpeqor p7, p0 = LIM, w1 C M I
304 (p6) add w1 = INCR, w1 C M I
307 }{.mmi; ld8 v1 = [vp], 8 C M01
308 ld8 u1 = [up], 8 C M01
311 }{.mmi; ld8 v2 = [vp], 8 C M01
312 cmp.CND p8, p0 = w2, u2 C M I
314 }{.mmi; ld8 u2 = [up], 8 C M01
316 ADDSUB w3 = u3, v3 C M I
318 }{.mmi; ld8 v3 = [vp], 8 C M01
319 cmp.CND p9, p0 = w3, u3 C M I
320 (p7) cmpeqor p8, p0 = LIM, w2 C M I
321 }{.mmb; ld8 u3 = [up], 8 C M01
322 (p7) add w2 = INCR, w2 C M I
C Carry-in feed-in for n mod 8 == 5 (presumably the .Lc101 target; the label
C line is not visible here).
C Preloads four limb pairs, forms w0 from the first pair and starts w1.
C p9 = (r23 != 0) is the carry-in; it is not applied to w0 in the visible
C lines, so presumably the code branched to applies it.  p6 = raw carry test
C of w0.  rpx = rp + 16.
C NOTE(review): the branch that ends this path is not visible in this extract.
327 {.mmi; ld8 v1 = [vp], 8 C M01
328 ld8 u1 = [up], 8 C M01
331 }{.mmi; ld8 v2 = [vp], 8 C M01
332 ld8 u2 = [up], 8 C M01
335 }{.mmi; ld8 v3 = [vp], 8 C M01
336 ld8 u3 = [up], 8 C M01
337 ADDSUB w0 = r10, r11 C M I
338 }{.mmi; cmp.ne p9, p0 = 0, r23 C M I
339 add rpx = 16, rp C M I
342 }{.mmi; ld8 v0 = [vp], 8 C M01
343 ld8 u0 = [up], 8 C M01
344 cmp.CND p6, p0 = w0, r10 C M I
345 }{.mbb; ADDSUB w1 = u1, v1 C M I
C Carry-in feed-in for n mod 8 == 6 (presumably the .Lc110 target; the label
C line is not visible here).
C w3 = first limb result with carry-in p8 = (r23 != 0); p9 = carry out of w3;
C w0 = second limb result.  upadv/vpadv are recomputed from the advanced
C pointers.  rpx = rp + 24.
C NOTE(review): the branch that ends this path is not visible in this extract.
351 {.mmi; ld8 v0 = [vp], 8 C M01
352 ld8 u0 = [up], 8 C M01
355 }{.mmi; add upadv = PFDIST, up C M I
356 add vpadv = PFDIST, vp C M I
358 }{.mmi; ld8 v1 = [vp], 8 C M01
359 ld8 u1 = [up], 8 C M01
360 ADDSUB w3 = r10, r11 C M I
362 }{.mmi; ld8 v2 = [vp], 8 C M01
363 ld8 u2 = [up], 8 C M01
364 ADDSUB w0 = u0, v0 C M I
365 }{.mmi; cmp.CND p9, p0 = w3, r10 C M I
366 cmp.ne p8, p0 = 0, r23 C M I
367 add rpx = 24, rp C M I
369 }{.mmi; ld8 v3 = [vp], 8 C M01
370 ld8 u3 = [up], 8 C M01
372 }{.mmb; (p8) cmpeqor p9, p0 = LIM, w3 C M I
373 (p8) add w3 = INCR, w3 C M I
C Carry-in feed-in for n mod 8 == 7 (presumably the .Lc111 target; the label
C line is not visible here).
C Uses both preloaded pairs: w2 from (r10,r11) with carry-in p13, w3 from
C (r18,r19) with carry p8 out of w2.  w2 is stored as the first result limb,
C then w0 is started.  rpx = rp + 32.
C NOTE(review): the branch that ends this path is not visible in this extract.
378 {.mmi; ld8 v0 = [vp], 8 C M01
379 ld8 u0 = [up], 8 C M01
382 }{.mmi; add upadv = PFDIST, up C M I
383 ld8 v1 = [vp], 8 C M01
385 }{.mmi; ld8 u1 = [up], 8 C M01
386 ADDSUB w2 = r10, r11 C M I
389 }{.mmi; add vpadv = PFDIST, vp C M I
390 ld8 v2 = [vp], 8 C M01
391 cmp.CND p8, p0 = w2, r10 C M I
392 }{.mmi; ld8 u2 = [up], 8 C M01
393 ADDSUB w3 = r18, r19 C M I
396 }{.mmi; ld8 v3 = [vp], 8 C M01
397 cmp.CND p9, p0 = w3, r18 C M I
398 (p13) cmpeqor p8, p0 = LIM, w2 C M I
399 }{.mmi; ld8 u3 = [up], 8 C M01
400 (p13) add w2 = INCR, w2 C M I
403 }{.mmi; add rpx = 32, rp C M I
404 st8 [rp] = w2, 8 C M23
405 (p8) cmpeqor p9, p0 = LIM, w3 C M I
406 }{.mmb; (p8) add w3 = INCR, w3 C M I
407 ADDSUB w0 = u0, v0 C M I
C Entry without carry-in (presumably func; the PROLOGUE line is not visible
C in this extract).  Same shape as the carry-in entry: under HAVE_ABI_32 the
C three pointers get addp4 fix-ups, then the first limb pair is loaded into
C r10/r11, r14 = n mod 8 and p15 = (8 < n) are computed, and residues 1..7
C branch to .Lb001 ... .Lb111 while residue 0 falls through.
C For residues 7 and 0 a second pair is loaded into r18/r19 and p13 is
C cleared (comparing r0 with itself for inequality is always false).
C NOTE(review): the closing quote of the ifdef body and several instructions
C appear to be missing from this extract.
416 ifdef
(`HAVE_ABI_32
',`
417 addp4 rp = 0, rp C M I
418 addp4 up = 0, up C M I
420 addp4 vp = 0, vp C M I
426 {.mmi; ld8 r11 = [vp], 8 C M01
427 ld8 r10 = [up], 8 C M01
429 }{.mmi; and r14 = 7, n C M I
430 cmp.lt p15, p14 = 8, n C M I
433 }{.mmi; cmp.eq p6, p0 = 1, r14 C M I
434 cmp.eq p7, p0 = 2, r14 C M I
435 cmp.eq p8, p0 = 3, r14 C M I
436 }{.bbb; (p6) br.dptk .Lb001 C B
437 (p7) br.dptk .Lb010 C B
438 (p8) br.dptk .Lb011 C B
440 }{.mmi; cmp.eq p9, p0 = 4, r14 C M I
441 cmp.eq p10, p0 = 5, r14 C M I
442 cmp.eq p11, p0 = 6, r14 C M I
443 }{.bbb; (p9) br.dptk .Lb100 C B
444 (p10) br.dptk .Lb101 C B
445 (p11) br.dptk .Lb110 C B
447 }{.mmi; ld8 r19 = [vp], 8 C M01
448 ld8 r18 = [up], 8 C M01
449 cmp.ne p13, p0 = r0, r0 C clear "CF" M I
450 }{.mmb; cmp.eq p12, p0 = 7, r14 C M I
452 (p12) br.dptk .Lb111 C B
C No-carry feed-in reached by falling through the dispatch (n mod 8 == 0);
C presumably labelled .Lb000 -- the label line is not visible here.
C w1 from (r10,r11) needs no carry correction; its carry p7 is propagated
C into w2 from (r18,r19); p8/p9 are the carry tests of w2/w3.
C upadv/vpadv = pointer + PFDIST; rpx = rp + 8.
C NOTE(review): stops, loop-count setup and the final branch appear to be
C missing from this extract.
457 {.mmi; ld8 v3 = [vp], 8 C M01
458 ld8 u3 = [up], 8 C M01
461 }{.mmi; ld8 v0 = [vp], 8 C M01
462 ld8 u0 = [up], 8 C M01
463 ADDSUB w1 = r10, r11 C M I
465 }{.mmi; ld8 v1 = [vp], 8 C M01
466 cmp.CND p7, p0 = w1, r10 C M I
468 }{.mmi; ld8 u1 = [up], 8 C M01
469 ADDSUB w2 = r18, r19 C M I
470 add rpx = 8, rp C M I
472 }{.mmi; add upadv = PFDIST, up
473 add vpadv = PFDIST, vp
474 cmp.CND p8, p0 = w2, r18 C M I
475 }{.mmi; ld8 v2 = [vp], 8 C M01
476 ld8 u2 = [up], 8 C M01
477 ADDSUB w3 = u3, v3 C M I
479 }{.mmi; ld8 v3 = [vp], 8 C M01
480 cmp.CND p9, p0 = w3, u3 C M I
481 (p7) cmpeqor p8, p0 = LIM, w2 C M I
482 }{.mmb; ld8 u3 = [up], 8 C M01
483 (p7) add w2 = INCR, w2 C M I
C No-carry feed-in for n mod 8 == 1 (presumably the .Lb001 target; the label
C line is not visible here).
C w0 = first limb result, p6 = its carry; the u1/v1 loads only happen when
C p15 (8 < n) is set.  The remaining lines preload further pairs, compute w1
C (carry test p7) and start w2; rpx = rp + 16.
C NOTE(review): the early-exit branch for short operands, stops and the final
C branch appear to be missing from this extract.
489 {.mmi; ADDSUB w0 = r10, r11 C M I
490 (p15) ld8 v1 = [vp], 8 C M01
493 }{.mmb; cmp.CND p6, p0 = w0, r10 C M I
494 (p15) ld8 u1 = [up], 8 C M01
497 }{.mmi; add upadv = PFDIST, up
498 add vpadv = PFDIST, vp
500 }{.mmi; ld8 v2 = [vp], 8 C M01
501 ld8 u2 = [up], 8 C M01
502 cmp.CND p6, p0 = w0, r10 C M I
504 }{.mmi; ld8 v3 = [vp], 8 C M01
505 ld8 u3 = [up], 8 C M01
508 }{.mmi; ld8 v0 = [vp], 8 C M01
509 ld8 u0 = [up], 8 C M01
510 ADDSUB w1 = u1, v1 C M I
512 }{.mmi; ld8 v1 = [vp], 8 C M01
513 cmp.CND p7, p0 = w1, u1 C M I
514 ADDSUB w2 = u2, v2 C M I
515 }{.mmb; ld8 u1 = [up], 8 C M01
516 add rpx = 16, rp C M I
C No-carry feed-in for n mod 8 == 2 (presumably the .Lb010 target; the label
C line is not visible here).
C w3 = first limb result (carry test p9), w0 = second limb result (carry
C test p6).
522 {.mmi; ld8 v0 = [vp], 8 C M01
523 ld8 u0 = [up], 8 C M01
525 }{.mmb; ADDSUB w3 = r10, r11 C M I
529 }{.mmi; cmp.CND p9, p0 = w3, r10 C M I
530 ADDSUB w0 = u0, v0 C M I
534 cmp.CND p6, p0 = w0, u0 C M I
C Second stretch: preloads three more limb pairs and repeats the p9/w0
C computation; rpx = rp + 24.  NOTE(review): the branches and local label
C separating the two stretches are not visible in this extract.
538 {.mmi; ld8 v1 = [vp], 8 C M01
539 ld8 u1 = [up], 8 C M01
542 }{.mmi; add upadv = PFDIST, up
543 add vpadv = PFDIST, vp
545 }{.mmi; ld8 v2 = [vp], 8 C M01
546 ld8 u2 = [up], 8 C M01
549 }{.mmi; ld8 v3 = [vp], 8 C M01
550 cmp.CND p9, p0 = w3, r10 C M I
551 ADDSUB w0 = u0, v0 C M I
552 }{.mmb; ld8 u3 = [up], 8 C M01
553 add rpx = 24, rp C M I
C No-carry feed-in for n mod 8 == 3 (presumably the .Lb011 target; the label
C line is not visible here).
C w2 = first limb result (carry test p8), w3 = second limb result.
559 {.mmi; ld8 v3 = [vp], 8 C M01
560 ld8 u3 = [up], 8 C M01
561 ADDSUB w2 = r10, r11 C M I
563 }{.mmb; ld8 v0 = [vp], 8 C M01
564 ld8 u0 = [up], 8 C M01
566 }{.mmb; cmp.CND p8, p0 = w2, r10 C M I
567 ADDSUB w3 = u3, v3 C M I
C Second stretch: preloads further limb pairs, repeats the w3/p8 computation,
C stores w2 as the first result limb, propagates p8 into w3 and starts w0;
C rpx = rp + 32.  NOTE(review): the branches and local label separating the
C two stretches are not visible in this extract.
571 {.mmi; ld8 v1 = [vp], 8 C M01
572 ld8 u1 = [up], 8 C M01
575 }{.mmi; add upadv = PFDIST, up
576 add vpadv = PFDIST, vp
577 ADDSUB w3 = u3, v3 C M I
578 }{.mmi; ld8 v2 = [vp], 8 C M01
579 ld8 u2 = [up], 8 C M01
580 cmp.CND p8, p0 = w2, r10 C M I
582 }{.mmi; ld8 v3 = [vp], 8 C M01
583 cmp.CND p9, p0 = w3, u3 C M I
585 }{.mmi; ld8 u3 = [up], 8 C M01
589 }{.mmi; add rpx = 32, rp C M I
590 st8 [rp] = w2, 8 C M23
591 (p8) cmpeqor p9, p0 = LIM, w3 C M I
592 }{.mmb; (p8) add w3 = INCR, w3 C M I
593 ADDSUB w0 = u0, v0 C M I
C No-carry feed-in for n mod 8 == 4 (presumably the .Lb100 target; the label
C line is not visible here).
C w1 = first limb result (carry test p7), w2 = second limb result.
599 {.mmi; ld8 v2 = [vp], 8 C M01
600 ld8 u2 = [up], 8 C M01
603 }{.mmi; ld8 v3 = [vp], 8 C M01
604 ld8 u3 = [up], 8 C M01
605 ADDSUB w1 = r10, r11 C M I
607 }{.mmi; ld8 v0 = [vp], 8 C M01
608 ld8 u0 = [up], 8 C M01
609 cmp.CND p7, p0 = w1, r10 C M I
611 ADDSUB w2 = u2, v2 C M I
C Second stretch: preloads further limb pairs, computes w3 and propagates p7
C into w2; rpx = rp + 8.  NOTE(review): the branches and any local label
C separating the two stretches are not visible in this extract.
616 {.mmi; add upadv = PFDIST, up
617 add vpadv = PFDIST, vp
619 }{.mmi; ld8 v1 = [vp], 8 C M01
620 ld8 u1 = [up], 8 C M01
623 }{.mmi; ld8 v2 = [vp], 8 C M01
624 cmp.CND p8, p0 = w2, u2 C M I
626 }{.mmi; ld8 u2 = [up], 8 C M01
627 ADDSUB w3 = u3, v3 C M I
628 add rpx = 8, rp C M I
630 }{.mmi; ld8 v3 = [vp], 8 C M01
631 cmp.CND p9, p0 = w3, u3 C M I
632 (p7) cmpeqor p8, p0 = LIM, w2 C M I
633 }{.mmb; ld8 u3 = [up], 8 C M01
634 (p7) add w2 = INCR, w2 C M I
C No-carry feed-in for n mod 8 == 5 (presumably the .Lb101 target; the label
C line is not visible here).
C Preloads four limb pairs, forms w0 from the first pair (carry test p6) and
C w1 from (u1,v1); rpx = rp + 16.
640 {.mmi; ld8 v1 = [vp], 8 C M01
641 ld8 u1 = [up], 8 C M01
644 }{.mmi; ld8 v2 = [vp], 8 C M01
645 ld8 u2 = [up], 8 C M01
646 ADDSUB w0 = r10, r11 C M I
648 }{.mmi; add upadv = PFDIST, up
649 add vpadv = PFDIST, vp
650 add rpx = 16, rp C M I
651 }{.mmi; ld8 v3 = [vp], 8 C M01
652 ld8 u3 = [up], 8 C M01
655 }{.mmi; ld8 v0 = [vp], 8 C M01
656 cmp.CND p6, p0 = w0, r10 C M I
658 }{.mmb; ld8 u0 = [up], 8 C M01
659 ADDSUB w1 = u1, v1 C M I
C Final stretch: carry test of w1 into p7, reload of u1/v1, start of w2.
C NOTE(review): whether a branch or label precedes this stretch cannot be
C told from this extract.
664 {.mmi; ld8 v1 = [vp], 8 C M01
665 cmp.CND p7, p0 = w1, u1 C M I
667 }{.mmb; ld8 u1 = [up], 8 C M01
668 ADDSUB w2 = u2, v2 C M I
C No-carry feed-in for n mod 8 == 6 (presumably the .Lb110 target; the label
C line is not visible here).
C w3 = first limb result (carry test p9), w0 = second limb result, with four
C more limb pairs preloaded; rpx = rp + 24.
C NOTE(review): the branch that ends this path is not visible in this extract.
674 {.mmi; ld8 v0 = [vp], 8 C M01
675 ld8 u0 = [up], 8 C M01
678 }{.mmi; ld8 v1 = [vp], 8 C M01
679 ld8 u1 = [up], 8 C M01
680 ADDSUB w3 = r10, r11 C M I
682 }{.mmi; add upadv = PFDIST, up
683 add vpadv = PFDIST, vp
685 }{.mmi; ld8 v2 = [vp], 8 C M01
686 ld8 u2 = [up], 8 C M01
689 }{.mmi; ld8 v3 = [vp], 8 C M01
690 cmp.CND p9, p0 = w3, r10 C M I
691 ADDSUB w0 = u0, v0 C M I
692 }{.mmb; ld8 u3 = [up], 8 C M01
693 add rpx = 24, rp C M I
C No-carry feed-in for n mod 8 == 7 (presumably the .Lb111 target; the label
C line is not visible here).
C Uses both preloaded pairs: w2 from (r10,r11) with carry test p8, w3 from
C (r18,r19) with carry test p9.  w2 is stored as the first result limb, p8 is
C propagated into w3, and w0 is started.  rpx = rp + 32.
C NOTE(review): the branch that ends this path is not visible in this extract.
699 {.mmi; ld8 v0 = [vp], 8 C M01
700 ld8 u0 = [up], 8 C M01
703 }{.mmi; ld8 v1 = [vp], 8 C M01
704 ld8 u1 = [up], 8 C M01
705 ADDSUB w2 = r10, r11 C M I
707 }{.mmi; ld8 v2 = [vp], 8 C M01
708 cmp.CND p8, p0 = w2, r10 C M I
710 }{.mmi; ld8 u2 = [up], 8 C M01
711 ADDSUB w3 = r18, r19 C M I
714 }{.mmi; add upadv = PFDIST, up
715 add vpadv = PFDIST, vp
717 }{.mmi; ld8 v3 = [vp], 8 C M01
718 ld8 u3 = [up], 8 C M01
719 cmp.CND p9, p0 = w3, r18 C M I
721 }{.mmi; add rpx = 32, rp C M I
722 st8 [rp] = w2, 8 C M23
723 (p8) cmpeqor p9, p0 = LIM, w3 C M I
724 }{.mmb; (p8) add w3 = INCR, w3 C M I
725 ADDSUB w0 = u0, v0 C M I
729 C *** MAIN LOOP START ***
C Main loop body: two identical halves of four limbs each, so one pass
C handles eight limbs.  Labelled entry points visible here are c5, m5, m4,
C m23 (first half) and c1, m1, m0, m67 (second half).
C Rotating roles: u0-u3 / v0-v3 hold loaded source limbs, w0-w3 hold results,
C and p6, p7, p8, p9 hold the carry out of w0, w1, w2, w3 respectively.
C Each limb follows the same steps: ADDSUB forms the raw result, cmp.CND
C derives its carry, the cmpeqor predicated on the previous carry ORs in the
C wrap case (result equals LIM), and the predicated add applies INCR.
C Stores: rp writes w0, w1 and w3 (steps 8, 16, 8) and rpx writes w2
C (step 32), so both pointers advance 32 bytes per half.
C The closing br.cloop branches back to L(top).
C NOTE(review): the L(top) label, the loop-count setup and the branches that
C enter at the other labels are not visible in this extract.
732 L
(c5
): ld8 v1
= [vp
], 8 C M01
733 cmp.CND p7
, p0
= w1
, u1 C M I
734 (p9
) cmpeqor p6
, p0
= LIM
, w0 C M I
735 ld8 u1
= [up
], 8 C M01
736 (p9
) add w0
= INCR
, w0 C M I
737 ADDSUB w2
= u2
, v2 C M I
739 L
(m5
): ld8 v2
= [vp
], 8 C M01
740 cmp.CND p8
, p0
= w2
, u2 C M I
741 (p6
) cmpeqor p7
, p0
= LIM
, w1 C M I
742 ld8 u2
= [up
], 8 C M01
743 (p6
) add w1
= INCR
, w1 C M I
744 ADDSUB w3
= u3
, v3 C M I
746 st8
[rp
] = w0
, 8 C M23
747 ld8 v3
= [vp
], 8 C M01
748 cmp.CND p9
, p0
= w3
, u3 C M I
749 (p7
) cmpeqor p8
, p0
= LIM
, w2 C M I
750 ld8 u3
= [up
], 8 C M01
751 (p7
) add w2
= INCR
, w2 C M I
753 L
(m4
): st8
[rp
] = w1
, 16 C M23
754 st8
[rpx
] = w2
, 32 C M23
755 (p8
) cmpeqor p9
, p0
= LIM
, w3 C M I
757 (p8
) add w3
= INCR
, w3 C M I
758 ADDSUB w0
= u0
, v0 C M I
760 L
(m23
): st8
[rp
] = w3
, 8 C M23
761 ld8 v0
= [vp
], 8 C M01
762 cmp.CND p6
, p0
= w0
, u0 C M I
763 ld8 u0
= [up
], 8 C M01
764 ADDSUB w1
= u1
, v1 C M I
767 L
(c1
): ld8 v1
= [vp
], 8 C M01
768 cmp.CND p7
, p0
= w1
, u1 C M I
769 (p9
) cmpeqor p6
, p0
= LIM
, w0 C M I
770 ld8 u1
= [up
], 8 C M01
771 (p9
) add w0
= INCR
, w0 C M I
772 ADDSUB w2
= u2
, v2 C M I
774 L
(m1
): ld8 v2
= [vp
], 8 C M01
775 cmp.CND p8
, p0
= w2
, u2 C M I
776 (p6
) cmpeqor p7
, p0
= LIM
, w1 C M I
777 ld8 u2
= [up
], 8 C M01
778 (p6
) add w1
= INCR
, w1 C M I
779 ADDSUB w3
= u3
, v3 C M I
781 st8
[rp
] = w0
, 8 C M23
782 ld8 v3
= [vp
], 8 C M01
783 cmp.CND p9
, p0
= w3
, u3 C M I
784 (p7
) cmpeqor p8
, p0
= LIM
, w2 C M I
785 ld8 u3
= [up
], 8 C M01
786 (p7
) add w2
= INCR
, w2 C M I
788 L
(m0
): st8
[rp
] = w1
, 16 C M23
789 st8
[rpx
] = w2
, 32 C M23
790 (p8
) cmpeqor p9
, p0
= LIM
, w3 C M I
792 (p8
) add w3
= INCR
, w3 C M I
793 ADDSUB w0
= u0
, v0 C M I
795 L
(m67
): st8
[rp
] = w3
, 8 C M23
796 ld8 v0
= [vp
], 8 C M01
797 cmp.CND p6
, p0
= w0
, u0 C M I
798 ld8 u0
= [up
], 8 C M01
799 ADDSUB w1
= u1
, v1 C M I
800 br.cloop.dptk L
(top
) C B
802 C *** MAIN LOOP END ***
C Wind-down after the loop: finishes the limbs still in flight using the
C same carry idiom as the loop, but with no further loads.  Results are
C stored through rp in the order w0, w1, w2, w3, w0, each with step 8.
C The final carry ends up in p6; r8 is set to 1 when it is set, and control
C returns through b0.
C NOTE(review): the .Lcj* entry labels, the short-operand exit branches and
C the zeroing of r8 are not visible in this extract -- confirm against the
C complete file.
805 {.mmi; (p9) cmpeqor p6, p0 = LIM, w0 C M I
806 (p9) add w0 = INCR, w0 C M I
810 {.mmi; cmp.CND p7, p0 = w1, u1 C M I
811 ADDSUB w2 = u2, v2 C M I
814 }{.mmi; st8 [rp] = w0, 8 C M23
815 (p6) cmpeqor p7, p0 = LIM, w1 C M I
816 (p6) add w1 = INCR, w1 C M I
819 {.mmi; cmp.CND p8, p0 = w2, u2 C M I
820 ADDSUB w3 = u3, v3 C M I
823 }{.mmi; st8 [rp] = w1, 8 C M23
824 (p7) cmpeqor p8, p0 = LIM, w2 C M I
825 (p7) add w2 = INCR, w2 C M I
828 {.mmi; cmp.CND p9, p0 = w3, u3 C M I
829 ADDSUB w0 = u0, v0 C M I
832 }{.mmi; st8 [rp] = w2, 8 C M23
833 (p8) cmpeqor p9, p0 = LIM, w3 C M I
834 (p8) add w3 = INCR, w3 C M I
835 }{.mmi; cmp.CND p6, p0 = w0, u0 C M I
841 {.mmi; st8 [rp] = w3, 8 C M23
842 (p9) cmpeqor p6, p0 = LIM, w0 C M I
843 (p9) add w0 = INCR, w0 C M I
C Last limb store; carry-out is reported in r8.
847 {.mmb; st8 [rp] = w0, 8 C M23
848 (p6) mov r8 = 1 C M I
849 br.ret.sptk.many b0 C B