dnl  IA-64 mpn_addmul_1 -- Multiply a limb vector with a limb and add the
dnl  result to a second limb vector.
dnl  Contributed to the GNU project by Torbjorn Granlund.

dnl  Copyright 2000-2005, 2007 Free Software Foundation, Inc.
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.
34 include(`..
/config.m4
')
C  * Further optimize feed-in and wind-down code, both for speed and code size.
C  * Handle low limb input and results specially, using a common stf8 in the
C    epilogue.
C  * Use 1 c/l carry propagation scheme in wind-down code.
C  * Use extra pointer registers for `up' and rp to speed up feed-in loads.
C  * Work out final differences with mul_1.asm.  That function is 300 bytes
C    smaller than this due to better loop scheduling and thus simpler feed-in
C    code.
57 PROLOGUE
(mpn_addmul_1
)
63 ` addp4 rp = 0, rp C M I
64 addp4 up = 0, up C M I
69 adds r15 = -1, n C M I
80 setf.sig f6 = vl C M2 M3
81 cmp.eq p10, p0 = 0, r14 C M I
82 shr.u r31 = r15, 2 C I0
85 cmp.eq p11, p0 = 2, r14 C M I
86 cmp.eq p12, p0 = 3, r14 C M I
91 cmp.ne p6, p7 = r0, r0 C M I
92 mov.i ar.lc = r31 C I0
93 cmp.ne p8, p9 = r0, r0 C M I
96 (p10) br.dptk .Lb00 C B
97 (p11) br.dptk .Lb10 C B
98 (p12) br.dptk .Lb11 C B
102 .
Lb01: br.cloop.dptk .grt1 C B
104 xma.l f39
= f7
, f6
, f8 C F
105 xma.hu f43
= f7
, f6
, f8 C F
107 getf.sig r8
= f43 C M2
108 stf8
[r20
] = f39 C M2 M3
109 mov.i ar.lc
= r2 C I0
110 br.
ret.sptk.many b0 C B
120 xma.l f39
= f7
, f6
, f8
122 xma.hu f43
= f7
, f6
, f8
128 xma.l f36
= f32
, f6
, f44
129 xma.hu f40
= f32
, f6
, f44
132 xma.l f37
= f33
, f6
, f45
133 xma.hu f41
= f33
, f6
, f45
137 xma.l f38
= f34
, f6
, f46
138 xma.hu f42
= f34
, f6
, f46
142 xma.l f39
= f35
, f6
, f47
143 xma.hu f43
= f35
, f6
, f47
151 xma.l f36
= f32
, f6
, f44
152 xma.hu f40
= f32
, f6
, f44
155 xma.l f37
= f33
, f6
, f45
157 xma.hu f41
= f33
, f6
, f45
163 xma.l f38
= f34
, f6
, f46
165 xma.hu f42
= f34
, f6
, f46
171 xma.l f39
= f35
, f6
, f47
173 xma.hu f43
= f35
, f6
, f47
181 .
Lb10: ldf8 f35
= [up
], 8
185 xma.l f38
= f7
, f6
, f8
186 xma.hu f42
= f7
, f6
, f8
188 xma.l f39
= f35
, f6
, f47
189 xma.hu f43
= f35
, f6
, f47
202 xma.l f38
= f7
, f6
, f8
204 xma.hu f42
= f7
, f6
, f8
207 xma.l f39
= f35
, f6
, f47
209 xma.hu f43
= f35
, f6
, f47
216 xma.l f36
= f32
, f6
, f44
217 xma.hu f40
= f32
, f6
, f44
221 xma.l f37
= f33
, f6
, f45
222 xma.hu f41
= f33
, f6
, f45
226 xma.l f38
= f34
, f6
, f46
227 xma.hu f42
= f34
, f6
, f46
231 xma.l f39
= f35
, f6
, f47
232 xma.hu f43
= f35
, f6
, f47
237 xma.l f36
= f32
, f6
, f44
238 xma.hu f40
= f32
, f6
, f44
244 xma.l f37
= f33
, f6
, f45
246 xma.hu f41
= f33
, f6
, f45
252 xma.l f38
= f34
, f6
, f46
254 xma.hu f42
= f34
, f6
, f46
261 .
Lb11: ldf8 f34
= [up
], 8
269 xma.l f37
= f7
, f6
, f8
270 xma.hu f41
= f7
, f6
, f8
271 xma.l f38
= f34
, f6
, f46
272 xma.hu f42
= f34
, f6
, f46
273 xma.l f39
= f35
, f6
, f47
274 xma.hu f43
= f35
, f6
, f47
286 xma.l f37
= f7
, f6
, f8
288 xma.hu f41
= f7
, f6
, f8
291 xma.l f38
= f34
, f6
, f46
293 xma.hu f42
= f34
, f6
, f46
296 xma.l f39
= f35
, f6
, f47
298 xma.hu f43
= f35
, f6
, f47
301 getf.sig r25
= f37 C FIXME
306 stf8
[r20
] = f37
, 8 C FIXME
307 xma.l f36
= f32
, f6
, f44
309 xma.hu f40
= f32
, f6
, f44
312 xma.l f37
= f33
, f6
, f45
314 xma.hu f41
= f33
, f6
, f45
317 xma.l f38
= f34
, f6
, f46
319 xma.hu f42
= f34
, f6
, f46
324 xma.l f36
= f32
, f6
, f44
326 xma.hu f40
= f32
, f6
, f44
332 xma.l f37
= f33
, f6
, f45
334 xma.hu f41
= f33
, f6
, f45
341 .
Lb00: ldf8 f33
= [up
], 8
348 xma.l f36
= f7
, f6
, f8
350 xma.hu f40
= f7
, f6
, f8
353 xma.l f37
= f33
, f6
, f45
354 xma.hu f41
= f33
, f6
, f45
355 xma.l f38
= f34
, f6
, f46
356 xma.hu f42
= f34
, f6
, f46
360 xma.l f39
= f35
, f6
, f47
362 xma.hu f43
= f35
, f6
, f47
372 xma.l f37
= f33
, f6
, f45
374 xma.hu f41
= f33
, f6
, f45
377 xma.l f38
= f34
, f6
, f46
379 xma.hu f42
= f34
, f6
, f46
382 getf.sig r24
= f36 C FIXME
383 xma.l f39
= f35
, f6
, f47
386 xma.hu f43
= f35
, f6
, f47
394 stf8
[r20
] = f36
, 8 C FIXME
395 xma.l f36
= f32
, f6
, f44
398 xma.hu f40
= f32
, f6
, f44
400 xma.l f37
= f33
, f6
, f45
402 xma.hu f41
= f33
, f6
, f45
407 xma.l f36
= f32
, f6
, f44
409 xma.hu f40
= f32
, f6
, f44
C *** MAIN LOOP START ***
417 ALIGN(32) C insn fed cycle #
419 .pred.rel
"mutex", p6
, p7 C num by i1 i2
420 getf.sig r29
= f41 C
00 16 0 0
421 xma.l f36
= f32
, f6
, f44 C
01 06,15 0 0
422 (p6
) add r14
= r30
, r27
, 1 C
02 0 0
423 ldf8 f47
= [rp
], 8 C
03 0 0
424 xma.hu f40
= f32
, f6
, f44 C
04 06,15 0 0
425 (p7
) add r14
= r30
, r27 C
05 0 0
427 .pred.rel
"mutex", p6
, p7
428 ldf8 f32
= [up
], 8 C
06 1 1
429 (p6
) cmp.leu p8
, p9
= r14
, r27 C
07 1 1
430 (p7
) cmp.ltu p8
, p9
= r14
, r27 C
08 1 1
431 getf.sig r26
= f38 C
09 25 2 1
432 st8
[r20
] = r14
, 8 C
10 2 1
436 .pred.rel
"mutex", p8
, p9
437 getf.sig r30
= f42 C
12 28 3 2
438 xma.l f37
= f33
, f6
, f45 C
13 18,27 3 2
439 (p8
) add r16
= r31
, r24
, 1 C
14 3 2
440 ldf8 f44
= [rp
], 8 C
15 3 2
441 xma.hu f41
= f33
, f6
, f45 C
16 18,27 3 2
442 (p9
) add r16
= r31
, r24 C
17 3 2
444 .pred.rel
"mutex", p8
, p9
445 ldf8 f33
= [up
], 8 C
18 4 3
446 (p8
) cmp.leu p6
, p7
= r16
, r24 C
19 4 3
447 (p9
) cmp.ltu p6
, p7
= r16
, r24 C
20 4 3
448 getf.sig r27
= f39 C
21 37 5 3
449 st8
[r20
] = r16
, 8 C
22 5 3
453 .pred.rel
"mutex", p6
, p7
454 getf.sig r31
= f43 C
24 40 6 4
455 xma.l f38
= f34
, f6
, f46 C
25 30,39 6 4
456 (p6
) add r14
= r28
, r25
, 1 C
26 6 4
457 ldf8 f45
= [rp
], 8 C
27 6 4
458 xma.hu f42
= f34
, f6
, f46 C
28 30,39 6 4
459 (p7
) add r14
= r28
, r25 C
29 6 4
461 .pred.rel
"mutex", p6
, p7
462 ldf8 f34
= [up
], 8 C
30 7 5
463 (p6
) cmp.leu p8
, p9
= r14
, r25 C
31 7 5
464 (p7
) cmp.ltu p8
, p9
= r14
, r25 C
32 7 5
465 getf.sig r24
= f36 C
33 01 8 5
466 st8
[r20
] = r14
, 8 C
34 8 5
470 .pred.rel
"mutex", p8
, p9
471 getf.sig r28
= f40 C
36 04 9 6
472 xma.l f39
= f35
, f6
, f47 C
37 42,03 9 6
473 (p8
) add r16
= r29
, r26
, 1 C
38 9 6
474 ldf8 f46
= [rp
], 8 C
39 9 6
475 xma.hu f43
= f35
, f6
, f47 C
40 42,03 9 6
476 (p9
) add r16
= r29
, r26 C
41 9 6
478 .pred.rel
"mutex", p8
, p9
479 ldf8 f35
= [up
], 8 C
42 10 7
480 (p8
) cmp.leu p6
, p7
= r16
, r26 C
43 10 7
481 (p9
) cmp.ltu p6
, p7
= r16
, r26 C
44 10 7
482 getf.sig r25
= f37 C
45 13 11 7
483 st8
[r20
] = r16
, 8 C
46 11 7
484 br.cloop.dptk .
Loop C
47 11 7
C *** MAIN LOOP END ***
488 .pred.rel
"mutex", p6
, p7
490 xma.l f36
= f32
, f6
, f44 C
491 (p6
) add r14
= r30
, r27
, 1 C
493 xma.hu f40
= f32
, f6
, f44 C
494 (p7
) add r14
= r30
, r27 C
496 .pred.rel
"mutex", p6
, p7
497 (p6
) cmp.leu p8
, p9
= r14
, r27 C
498 (p7
) cmp.ltu p8
, p9
= r14
, r27 C
502 .pred.rel
"mutex", p8
, p9
504 xma.l f37
= f33
, f6
, f45 C
505 (p8
) add r16
= r31
, r24
, 1 C
506 xma.hu f41
= f33
, f6
, f45 C
507 (p9
) add r16
= r31
, r24 C
509 .pred.rel
"mutex", p8
, p9
510 (p8
) cmp.leu p6
, p7
= r16
, r24 C
511 (p9
) cmp.ltu p6
, p7
= r16
, r24 C
516 .pred.rel
"mutex", p6
, p7
518 xma.l f38
= f34
, f6
, f46 C
519 (p6
) add r14
= r28
, r25
, 1 C
520 xma.hu f42
= f34
, f6
, f46 C
521 (p7
) add r14
= r28
, r25 C
523 .pred.rel
"mutex", p6
, p7
524 (p6
) cmp.leu p8
, p9
= r14
, r25 C
525 (p7
) cmp.ltu p8
, p9
= r14
, r25 C
530 .pred.rel
"mutex", p8
, p9
532 xma.l f39
= f35
, f6
, f47 C
533 (p8
) add r16
= r29
, r26
, 1 C
534 xma.hu f43
= f35
, f6
, f47 C
535 (p9
) add r16
= r29
, r26 C
537 .pred.rel
"mutex", p8
, p9
538 (p8
) cmp.leu p6
, p7
= r16
, r26 C
539 (p9
) cmp.ltu p6
, p7
= r16
, r26 C
544 .pred.rel
"mutex", p6
, p7
546 (p6
) add r14
= r30
, r27
, 1 C
547 (p7
) add r14
= r30
, r27 C
549 .pred.rel
"mutex", p6
, p7
550 (p6
) cmp.leu p8
, p9
= r14
, r27 C
551 (p7
) cmp.ltu p8
, p9
= r14
, r27 C
556 .pred.rel
"mutex", p8
, p9
558 (p8
) add r16
= r31
, r24
, 1 C
559 (p9
) add r16
= r31
, r24 C
561 .pred.rel
"mutex", p8
, p9
562 (p8
) cmp.leu p6
, p7
= r16
, r24 C
563 (p9
) cmp.ltu p6
, p7
= r16
, r24 C
568 .pred.rel
"mutex", p6
, p7
570 (p6
) add r14
= r28
, r25
, 1 C
571 (p7
) add r14
= r28
, r25 C
573 .pred.rel
"mutex", p6
, p7
575 (p6
) cmp.leu p8
, p9
= r14
, r25 C
576 (p7
) cmp.ltu p8
, p9
= r14
, r25 C
579 .pred.rel
"mutex", p8
, p9
580 (p8
) add r16
= r29
, r26
, 1 C
581 (p9
) add r16
= r29
, r26 C
583 .pred.rel
"mutex", p8
, p9
585 (p8
) cmp.leu p6
, p7
= r16
, r26 C
586 (p9
) cmp.ltu p6
, p7
= r16
, r26 C
589 .pred.rel
"mutex", p6
, p7
590 (p6
) add r14
= r30
, r27
, 1 C
591 (p7
) add r14
= r30
, r27 C
593 .pred.rel
"mutex", p6
, p7
595 (p6
) cmp.leu p8
, p9
= r14
, r27 C
596 (p7
) cmp.ltu p8
, p9
= r14
, r27 C
598 (p8
) add r8
= 1, r8 C M I
599 mov.i ar.lc
= r2 C I0
600 br.
ret.sptk.many b0 C B