dnl  IA-64 mpn_mul_1, mpn_mul_1c -- Multiply a limb vector with a limb and
dnl  store the result in a second limb vector.
dnl  Contributed to the GNU project by Torbjorn Granlund.
dnl  Copyright 2000-2004, 2006, 2007 Free Software Foundation, Inc.
dnl  This file is part of the GNU MP Library.
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl  or both in parallel, as here.
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.
34 include(`..
/config.m4
')
C  * Further optimize feed-in and wind-down code, both for speed and code size.
C  * Handle low limb input and results specially, using a common stf8 in the
C    epilogue.
C  * Use 1 c/l carry propagation scheme in wind-down code.
C  * Use extra pointer register for `up' to speed up feed-in loads.
C  * Work out final differences with addmul_1.asm.
define(`cy', `r36')	C r36; read by the mpn_mul_1c entry code (setf.sig f9 = cy)
`	addp4		rp = 0, rp		C M I
	addp4		up = 0, up		C M I
C The two addp4 above rebuild rp and up as 64-bit addresses from their low
C 32 bits.
C NOTE(review): the lines of this entry sequence are not contiguous in this
C excerpt; bundle templates, stop bits and the instructions that set r14 and
C load f7 and f9 are not shown here.

C r15 = n - 1
	adds		r15 = -1, n		C M I
C The multiplier limb vl goes into f6, the common factor of every xma below.
	setf.sig	f6 = vl			C M2 M3
C r31 = (n - 1) / 4; it becomes the loop count a few lines down.
	shr.u		r31 = r15, 2		C I0
C Dispatch predicates for r14 equal to 0, 2 and 3.  r14 is presumably
C n mod 4 -- confirm against the code that sets it.
	cmp.eq		p10, p0 = 0, r14	C M I
	cmp.eq		p11, p0 = 2, r14	C M I
	cmp.eq		p12, p0 = 3, r14	C M I
C r0 never differs from itself, so this clears p6 and sets p7.
	cmp.ne		p6, p7 = r0, r0		C M I
	mov.i		ar.lc = r31		C I0
C Same trick: p8 cleared, p9 set.
	cmp.ne		p8, p9 = r0, r0		C M I
   (p10) br.dptk	.Lb00			C B
   (p11) br.dptk	.Lb10			C B
   (p12) br.dptk	.Lb11			C B
C None of the three cases matched.  br.cloop goes to .grt1 (decrementing
C ar.lc) when ar.lc is non-zero and falls through when it is zero.
	br.cloop.dptk	.grt1			C B
C Fall-through of the br.cloop: one product, then return.  Presumably the
C n = 1 case -- confirm.  f7 (source limb) and f9 (addend) are set up before
C this point.
C f39 = low 64 bits, f43 = high 64 bits of f7 * f6 + f9.
	xma.l		f39 = f7, f6, f9	C F
	xma.hu		f43 = f7, f6, f9	C F
C The high half is handed back in r8 (integer return register in the
C Itanium software conventions).
	getf.sig	r8 = f43		C M2
C The low half is the result limb written at rp.
	stf8		[rp] = f39		C M2 M3
C Put back the loop counter that was parked in r2, then return.
	mov.i		ar.lc = r2		C I0
	br.ret.sptk.many b0			C B
C Feed-in multiplies following the single-product return above; presumably
C the .grt1 target, whose label is not among the lines shown.
C NOTE(review): only the xma instructions of this path are visible in this
C excerpt.  The loads, register moves, stores and branches that sit between
C them are missing, so this is not a complete instruction stream.
C
C Register pattern used throughout: xma.l writes the low half of a product
C to f36..f39 and xma.hu writes the matching high half to f40..f43, for
C source limbs in f32..f35.  f0 reads as zero, so those are plain products.

C First limb: the only product here that adds in f9.
	xma.l		f39 = f7, f6, f9
	xma.hu		f43 = f7, f6, f9

C Four plain products, one per source register f32..f35.
	xma.l		f36 = f32, f6, f0
	xma.hu		f40 = f32, f6, f0
	xma.l		f37 = f33, f6, f0
	xma.hu		f41 = f33, f6, f0
	xma.l		f38 = f34, f6, f0
	xma.hu		f42 = f34, f6, f0
	xma.l		f39 = f35, f6, f0
	xma.hu		f43 = f35, f6, f0

C The same four products again.
	xma.l		f36 = f32, f6, f0
	xma.hu		f40 = f32, f6, f0
	xma.l		f37 = f33, f6, f0
	xma.hu		f41 = f33, f6, f0
	xma.l		f38 = f34, f6, f0
	xma.hu		f42 = f34, f6, f0
	xma.l		f39 = f35, f6, f0
	xma.hu		f43 = f35, f6, f0
C Target of the (p11) dispatch branch, taken when r14 is 2.
C NOTE(review): only the label line and the xma instructions of this path
C are visible in this excerpt; the instructions between them are missing.
C Load the next source limb into f35 and advance up by 8 bytes.
.Lb10:	ldf8		f35 = [up], 8

C First limb with addend f9; low half in f38, high half in f42.
	xma.l		f38 = f7, f6, f9
	xma.hu		f42 = f7, f6, f9
C Second limb with the previous high half f42 as addend, so the carry
C between the two limbs is absorbed by the multiply-add itself.
	xma.l		f39 = f35, f6, f42
	xma.hu		f43 = f35, f6, f42

C Same two limbs again, this time with a zero addend (f0) for the second.
	xma.l		f38 = f7, f6, f9
	xma.hu		f42 = f7, f6, f9
	xma.l		f39 = f35, f6, f0
	xma.hu		f43 = f35, f6, f0

C Four plain products: low halves f36..f39, high halves f40..f43.
	xma.l		f36 = f32, f6, f0
	xma.hu		f40 = f32, f6, f0
	xma.l		f37 = f33, f6, f0
	xma.hu		f41 = f33, f6, f0
	xma.l		f38 = f34, f6, f0
	xma.hu		f42 = f34, f6, f0
	xma.l		f39 = f35, f6, f0
	xma.hu		f43 = f35, f6, f0

C Three more plain products for f32..f34.
	xma.l		f36 = f32, f6, f0
	xma.hu		f40 = f32, f6, f0
	xma.l		f37 = f33, f6, f0
	xma.hu		f41 = f33, f6, f0
	xma.l		f38 = f34, f6, f0
	xma.hu		f42 = f34, f6, f0
C Target of the (p12) dispatch branch, taken when r14 is 3.
C NOTE(review): only the label line and the xma instructions of this path
C are visible in this excerpt; the instructions between them are missing.
C Load the next source limb into f34 and advance up by 8 bytes.
.Lb11:	ldf8		f34 = [up], 8

C First limb with addend f9 (low f37, high f41), then plain products for
C f34 and f35.
	xma.l		f37 = f7, f6, f9
	xma.hu		f41 = f7, f6, f9
	xma.l		f38 = f34, f6, f0
	xma.hu		f42 = f34, f6, f0
	xma.l		f39 = f35, f6, f0
	xma.hu		f43 = f35, f6, f0

C The same three products again.
	xma.l		f37 = f7, f6, f9
	xma.hu		f41 = f7, f6, f9
	xma.l		f38 = f34, f6, f0
	xma.hu		f42 = f34, f6, f0
	xma.l		f39 = f35, f6, f0
	xma.hu		f43 = f35, f6, f0

C Plain products for f32..f34.
	xma.l		f36 = f32, f6, f0
	xma.hu		f40 = f32, f6, f0
	xma.l		f37 = f33, f6, f0
	xma.hu		f41 = f33, f6, f0
	xma.l		f38 = f34, f6, f0
	xma.hu		f42 = f34, f6, f0

C Plain products for f32 and f33.
	xma.l		f36 = f32, f6, f0
	xma.hu		f40 = f32, f6, f0
	xma.l		f37 = f33, f6, f0
	xma.hu		f41 = f33, f6, f0
C Target of the (p10) dispatch branch, taken when r14 is 0.
C NOTE(review): only the label line and the xma instructions of this path
C are visible in this excerpt; the instructions between them are missing.
C Load the next source limb into f33 and advance up by 8 bytes.
.Lb00:	ldf8		f33 = [up], 8

C First limb with addend f9 (low f36, high f40), then plain products for
C f33, f34 and f35.
	xma.l		f36 = f7, f6, f9
	xma.hu		f40 = f7, f6, f9
	xma.l		f37 = f33, f6, f0
	xma.hu		f41 = f33, f6, f0
	xma.l		f38 = f34, f6, f0
	xma.hu		f42 = f34, f6, f0
	xma.l		f39 = f35, f6, f0
	xma.hu		f43 = f35, f6, f0

C Plain products for f33..f35 again.
	xma.l		f37 = f33, f6, f0
	xma.hu		f41 = f33, f6, f0
	xma.l		f38 = f34, f6, f0
	xma.hu		f42 = f34, f6, f0
	xma.l		f39 = f35, f6, f0
	xma.hu		f43 = f35, f6, f0

C Plain products for f32 and f33.
	xma.l		f36 = f32, f6, f0
	xma.hu		f40 = f32, f6, f0
	xma.l		f37 = f33, f6, f0
	xma.hu		f41 = f33, f6, f0

C Plain product for f32.
	xma.l		f36 = f32, f6, f0
	xma.hu		f40 = f32, f6, f0
C *** MAIN LOOP START ***
C Four limbs per pass.  Each quarter multiplies one source limb and runs
C one step of a predicated carry chain in r24.
C
C Carry chain: two predicates (p6/p7 or p8/p9, alternating) say whether the
C previous add carried out.  With a carry in, the add uses the +1 form and a
C carry out is detected by result <= first addend (cmp.leu).  Without one,
C the plain add is used and a carry out is result < first addend (cmp.ltu).
C The .pred.rel "mutex" lines tell the assembler that the two predicates of
C a pair are never both set.
C
C NOTE(review): r16..r23 are not written by any line visible here and r24 is
C not stored by any visible line.  Presumably r16..r19 are low halves,
C r20..r23 are high halves of neighbouring products, and r24 is the result
C limb -- confirm against the full file.  The loads, moves, stores and loop
C branch are absent from this excerpt.

C Quarter 1: product for f32; carry out of the r17 + r20 sum.
	.pred.rel "mutex",p6,p7
	xma.l		f36 = f32, f6, f0
   (p6)	cmp.leu		p8, p9 = r24, r17
	xma.hu		f40 = f32, f6, f0
   (p7)	cmp.ltu		p8, p9 = r24, r17
	.pred.rel "mutex",p8,p9
   (p8)	add		r24 = r18, r21, 1
   (p9)	add		r24 = r18, r21

C Quarter 2: product for f33; carry out of the r18 + r21 sum.
	.pred.rel "mutex",p8,p9
	xma.l		f37 = f33, f6, f0
   (p8)	cmp.leu		p6, p7 = r24, r18
	xma.hu		f41 = f33, f6, f0
   (p9)	cmp.ltu		p6, p7 = r24, r18
	.pred.rel "mutex",p6,p7
   (p6)	add		r24 = r19, r22, 1
   (p7)	add		r24 = r19, r22

C Quarter 3: product for f34; carry out of the r19 + r22 sum.
	.pred.rel "mutex",p6,p7
	xma.l		f38 = f34, f6, f0
   (p6)	cmp.leu		p8, p9 = r24, r19
	xma.hu		f42 = f34, f6, f0
   (p7)	cmp.ltu		p8, p9 = r24, r19
	.pred.rel "mutex",p8,p9
   (p8)	add		r24 = r16, r23, 1
   (p9)	add		r24 = r16, r23

C Quarter 4: product for f35; carry out of the r16 + r23 sum.
	.pred.rel "mutex",p8,p9
	xma.l		f39 = f35, f6, f0
   (p8)	cmp.leu		p6, p7 = r24, r16
	xma.hu		f43 = f35, f6, f0
   (p9)	cmp.ltu		p6, p7 = r24, r16
	.pred.rel "mutex",p6,p7
   (p6)	add		r24 = r17, r20, 1
   (p7)	add		r24 = r17, r20
C *** MAIN LOOP END ***
C Wind-down after the main loop.  The first four steps still issue an xma
C pair each; the later steps only continue the carry chain.
C
C Carry chain, same convention as in the loop: with a carry in (p6 or p8)
C the add uses the +1 form and cmp.leu detects the carry out; without one
C (p7 or p9) the plain add is used and cmp.ltu detects it.
C
C NOTE(review): the labels, loads, moves, stores and branches that belong
C between these lines are absent from this excerpt, and r16..r23 are not
C written by any visible line.

C Step 1: product for f32.
	.pred.rel "mutex",p6,p7
	xma.l		f36 = f32, f6, f0
   (p6)	cmp.leu		p8, p9 = r24, r17
	xma.hu		f40 = f32, f6, f0
   (p7)	cmp.ltu		p8, p9 = r24, r17
	.pred.rel "mutex",p8,p9
   (p8)	add		r24 = r18, r21, 1
   (p9)	add		r24 = r18, r21

C Step 2: product for f33.
	.pred.rel "mutex",p8,p9
	xma.l		f37 = f33, f6, f0
   (p8)	cmp.leu		p6, p7 = r24, r18
	xma.hu		f41 = f33, f6, f0
   (p9)	cmp.ltu		p6, p7 = r24, r18
	.pred.rel "mutex",p6,p7
   (p6)	add		r24 = r19, r22, 1
   (p7)	add		r24 = r19, r22

C Step 3: product for f34.
	.pred.rel "mutex",p6,p7
	xma.l		f38 = f34, f6, f0
   (p6)	cmp.leu		p8, p9 = r24, r19
	xma.hu		f42 = f34, f6, f0
   (p7)	cmp.ltu		p8, p9 = r24, r19
	.pred.rel "mutex",p8,p9
   (p8)	add		r24 = r16, r23, 1
   (p9)	add		r24 = r16, r23

C Step 4: product for f35, the last multiply in the visible wind-down.
	.pred.rel "mutex",p8,p9
	xma.l		f39 = f35, f6, f0
   (p8)	cmp.leu		p6, p7 = r24, r16
	xma.hu		f43 = f35, f6, f0
   (p9)	cmp.ltu		p6, p7 = r24, r16
	.pred.rel "mutex",p6,p7
   (p6)	add		r24 = r17, r20, 1
   (p7)	add		r24 = r17, r20

C From here on only the carry chain continues.
	.pred.rel "mutex",p6,p7
   (p6)	cmp.leu		p8, p9 = r24, r17
   (p7)	cmp.ltu		p8, p9 = r24, r17
	.pred.rel "mutex",p8,p9
   (p8)	add		r24 = r18, r21, 1
   (p9)	add		r24 = r18, r21

	.pred.rel "mutex",p8,p9
   (p8)	cmp.leu		p6, p7 = r24, r18
   (p9)	cmp.ltu		p6, p7 = r24, r18
	.pred.rel "mutex",p6,p7
   (p6)	add		r24 = r19, r22, 1
   (p7)	add		r24 = r19, r22

	.pred.rel "mutex",p6,p7
   (p6)	cmp.leu		p8, p9 = r24, r19
   (p7)	cmp.ltu		p8, p9 = r24, r19
	.pred.rel "mutex",p8,p9
   (p8)	add		r24 = r16, r23, 1
   (p9)	add		r24 = r16, r23

	.pred.rel "mutex",p8,p9
   (p8)	cmp.leu		p6, p7 = r24, r16
   (p9)	cmp.ltu		p6, p7 = r24, r16
	.pred.rel "mutex",p6,p7
   (p6)	add		r24 = r17, r20, 1
   (p7)	add		r24 = r17, r20

C Last visible carry test, on the r17 + r20 sum.
	.pred.rel "mutex",p6,p7
   (p6)	cmp.leu		p8, p9 = r24, r17
   (p7)	cmp.ltu		p8, p9 = r24, r17
`	addp4		rp = 0, rp		C M I
	addp4		up = 0, up		C M I
C The two addp4 above rebuild rp and up as 64-bit addresses from their low
C 32 bits.
C NOTE(review): this is the visible start of the carry-in entry sequence.
C Its lines are not contiguous in this excerpt and the rest of the sequence
C lies past the last line shown.

C r15 = n - 1
	adds		r15 = -1, n		C M I
C The cy argument goes into f9, the addend of the first xma of each path.
	setf.sig	f9 = cy			C M2 M3
C Park the loop counter in r2; the return path copies r2 back to ar.lc.
	mov.i		r2 = ar.lc		C I0
C First source limb into f7; up is advanced by 8 bytes.
	ldf8		f7 = [up], 8		C M