dnl  IA-64 mpn_mul_2 -- Multiply an n-limb number with a 2-limb number and
dnl  store the result to a (n+1)-limb number.
4 dnl Contributed to the GNU project by Torbjorn Granlund.
dnl  Copyright 2004, 2011 Free Software Foundation, Inc.
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.
34 include(`..
/config.m4
')
C  * Clean up variable names, and try to decrease the number of distinct
C    registers used.
C  * Clean up feed-in code to not require zeroing several registers.
C  * Make sure we don't depend on uninitialized predicate registers.
C  * Could perhaps save a few cycles by using 1 c/l carry propagation in
C    wind-down code.
C  * Ultimately rewrite.  The problem with this code is that it first uses a
C    loaded u value in one xma pair, then leaves it live over several unrelated
C    xma pairs, before it uses it again.  It should actually be quite possible
C    to just swap some aligned xma pairs around.  But we should then schedule
C    u loads further from the first use.
C Symbolic register names.  Every family has four members, _0 to _3, which
C the feed-in code, the main loop and the wind-down code use in rotation.

C General registers written by getfsig from the fp0b_N products.
define(`pr0_0', `r16')
define(`pr0_1', `r17')
define(`pr0_2', `r18')
define(`pr0_3', `r19')

C General registers written by getfsig from the fp1b_N products.
define(`pr1_0', `r20')
define(`pr1_1', `r21')
define(`pr1_2', `r22')
define(`pr1_3', `r23')

C General registers written by getfsig from the fp2a_N products.
define(`acc1_0', `r24')
define(`acc1_1', `r25')
define(`acc1_2', `r26')
define(`acc1_3', `r27')

C Floating-point registers: xma.l results of u * v0 (addend f0).
define(`fp0b_0', `f8')
define(`fp0b_1', `f9')
define(`fp0b_2', `f10')
define(`fp0b_3', `f11')

C Floating-point registers: xma.hu results of u * v0 (addend f0).
define(`fp1a_0', `f12')
define(`fp1a_1', `f13')
define(`fp1a_2', `f14')
define(`fp1a_3', `f15')

C Floating-point registers: xma.l results of u * v1 + fp1a_N.
define(`fp1b_0', `f32')
define(`fp1b_1', `f33')
define(`fp1b_2', `f34')
define(`fp1b_3', `f35')

C Floating-point registers: xma.hu results of u * v1 + fp1a_N.
define(`fp2a_0', `f36')
define(`fp2a_1', `f37')
define(`fp2a_2', `f38')
define(`fp2a_3', `f39')

C Floating-point registers loaded with ldf8 from [up].
define(`u_0', `f44')
define(`u_1', `f45')
define(`u_2', `f46')
define(`u_3', `f47')
105 ifdef
(`HAVE_ABI_32
',`
106 {.mmi; addp4 rp = 0, rp C M I
107 addp4 up = 0, up C M I
108 addp4 vp = 0, vp C M I
115 {.mmi; ldf8 ux = [up], 8 C M
116 ldf8 v0 = [vp], 8 C M
122 }{.mmi; ldf8 uy = [up], 8 C M
126 cmp.eq p10, p0 = 1, r14 C M I
127 cmp.eq p11, p0 = 2, r14 C M I
130 cmp.eq p12, p0 = 3, r14 C M I
132 }{.bbb; (p10) br.dptk L(b01) C B
133 (p11) br.dptk L(b10) C B
134 (p12) br.dptk L(b11) C B
138 L
(b00
): ldf8 u_1
= [up
], 8
142 cmp.ne p8
, p9
= r0
, r0
144 xma.l fp0b_3
= ux
, v0
, f0
145 cmp.ne p12
, p13
= r0
, r0
147 xma.hu fp1a_3
= ux
, v0
, f0
150 xma.l fp0b_0
= uy
, v0
, f0
151 xma.hu fp1a_0
= uy
, v0
, f0
153 getfsig acc0
= fp0b_3
154 xma.l fp1b_3
= ux
, v1
, fp1a_3
155 xma.hu fp2a_3
= ux
, v1
, fp1a_3
157 xma.l fp0b_1
= u_1
, v0
, f0
158 xma.hu fp1a_1
= u_1
, v0
, f0
160 getfsig pr0_0
= fp0b_0
161 xma.l fp1b_0
= uy
, v1
, fp1a_0
162 xma.hu fp2a_0
= uy
, v1
, fp1a_0
164 getfsig pr1_3
= fp1b_3
165 getfsig acc1_3
= fp2a_3
166 xma.l fp0b_2
= u_2
, v0
, f0
167 xma.hu fp1a_2
= u_2
, v0
, f0
170 L
(gt4
): xma.l fp0b_0
= uy
, v0
, f0
171 xma.hu fp1a_0
= uy
, v0
, f0
173 getfsig acc0
= fp0b_3
174 xma.l fp1b_3
= ux
, v1
, fp1a_3
176 xma.hu fp2a_3
= ux
, v1
, fp1a_3
178 xma.l fp0b_1
= u_1
, v0
, f0
179 xma.hu fp1a_1
= u_1
, v0
, f0
181 getfsig pr0_0
= fp0b_0
182 xma.l fp1b_0
= uy
, v1
, fp1a_0
183 xma.hu fp2a_0
= uy
, v1
, fp1a_0
186 getfsig pr1_3
= fp1b_3
187 xma.l fp0b_2
= u_2
, v0
, f0
189 getfsig acc1_3
= fp2a_3
190 xma.hu fp1a_2
= u_2
, v0
, f0
195 L
(b01
): ldf8 u_0
= [up
], 8 C M
199 cmp.ne p6
, p7
= r0
, r0 C M I
201 xma.l fp0b_2
= ux
, v0
, f0 C F
202 cmp.ne p10
, p11
= r0
, r0 C M I
203 ldf8 u_1
= [up
], 8 C M
204 xma.hu fp1a_2
= ux
, v0
, f0 C F
206 xma.l fp0b_3
= uy
, v0
, f0 C F
207 xma.hu fp1a_3
= uy
, v0
, f0 C F
209 getfsig acc0
= fp0b_2 C M
210 xma.l fp1b_2
= ux
, v1
,fp1a_2 C F
211 ldf8 u_2
= [up
], 8 C M
212 xma.hu fp2a_2
= ux
, v1
,fp1a_2 C F
215 xma.l fp0b_0
= u_0
, v0
, f0 C F
216 xma.hu fp1a_0
= u_0
, v0
, f0 C F
218 getfsig pr0_3
= fp0b_3 C M
219 xma.l fp1b_3
= uy
, v1
,fp1a_3 C F
220 xma.hu fp2a_3
= uy
, v1
,fp1a_3 C F
222 getfsig pr1_2
= fp1b_2 C M
223 getfsig acc1_2
= fp2a_2 C M
224 xma.l fp0b_1
= u_1
, v0
, f0 C F
225 xma.hu fp1a_1
= u_1
, v0
, f0 C F
228 L
(gt5
): xma.l fp0b_0
= u_0
, v0
, f0
229 xma.hu fp1a_0
= u_0
, v0
, f0
231 getfsig pr0_3
= fp0b_3
232 xma.l fp1b_3
= uy
, v1
, fp1a_3
233 xma.hu fp2a_3
= uy
, v1
, fp1a_3
236 getfsig pr1_2
= fp1b_2
237 xma.l fp0b_1
= u_1
, v0
, f0
239 getfsig acc1_2
= fp2a_2
240 xma.hu fp1a_1
= u_1
, v0
, f0
245 L
(b10
): br.cloop.dptk L
(gt2
)
246 xma.l fp0b_1
= ux
, v0
, f0
247 xma.hu fp1a_1
= ux
, v0
, f0
249 xma.l fp0b_2
= uy
, v0
, f0
250 xma.hu fp1a_2
= uy
, v0
, f0
252 stf8
[rp
] = fp0b_1
, 8
253 xma.l fp1b_1
= ux
, v1
, fp1a_1
254 xma.hu fp2a_1
= ux
, v1
, fp1a_1
256 getfsig acc0
= fp0b_2
257 xma.l fp1b_2
= uy
, v1
, fp1a_2
258 xma.hu fp2a_2
= uy
, v1
, fp1a_2
260 getfsig pr1_1
= fp1b_1
261 getfsig acc1_1
= fp2a_1
263 getfsig pr1_2
= fp1b_2
269 cmp.ltu p8
, p9
= s0
, pr1_1
272 .pred.rel
"mutex", p8
, p9
273 (p8
) add acc0
= pr1_2
, acc1_1
, 1
274 (p9
) add acc0
= pr1_2
, acc1_1
275 (p8
) cmp.leu p10
, p0
= r31
, pr1_2
276 (p9
) cmp.ltu p10
, p0
= r31
, pr1_2
282 L
(gt2
): ldf8 u_3
= [up
], 8
287 xma.l fp0b_1
= ux
, v0
, f0
289 xma.hu fp1a_1
= ux
, v0
, f0
291 xma.l fp0b_2
= uy
, v0
, f0
292 xma.hu fp1a_2
= uy
, v0
, f0
294 getfsig acc0
= fp0b_1
295 xma.l fp1b_1
= ux
, v1
, fp1a_1
296 xma.hu fp2a_1
= ux
, v1
, fp1a_1
299 xma.l fp0b_3
= u_3
, v0
, f0
300 xma.hu fp1a_3
= u_3
, v0
, f0
302 getfsig pr0_2
= fp0b_2
303 xma.l fp1b_2
= uy
, v1
, fp1a_2
304 xma.hu fp2a_2
= uy
, v1
, fp1a_2
307 getfsig pr1_1
= fp1b_1
309 {.mfi; getfsig acc1_1 = fp2a_1
310 xma.l fp0b_0 = u_0, v0, f0
311 cmp.ne p8, p9 = r0, r0
312 }{.mfb; cmp.ne p12, p13 = r0, r0
313 xma.hu fp1a_0 = u_0, v0, f0
318 L
(b11
): mov acc1_3
= 0
322 cmp.ne p6
, p7
= r0
, r0
325 xma.l fp0b_0
= ux
, v0
, f0
326 xma.hu fp1a_0
= ux
, v0
, f0
328 cmp.ne p10
, p11
= r0
, r0
329 xma.l fp0b_1
= uy
, v0
, f0
330 xma.hu fp1a_1
= uy
, v0
, f0
332 getfsig acc0
= fp0b_0
333 xma.l fp1b_0
= ux
, v1
, fp1a_0
334 xma.hu fp2a_0
= ux
, v1
, fp1a_0
336 xma.l fp0b_2
= u_2
, v0
, f0
337 xma.hu fp1a_2
= u_2
, v0
, f0
339 getfsig pr0_1
= fp0b_1
340 xma.l fp1b_1
= uy
, v1
, fp1a_1
341 xma.hu fp2a_1
= uy
, v1
, fp1a_1
343 getfsig pr1_0
= fp1b_0
344 getfsig acc1_0
= fp2a_0
347 L
(gt3
): xma.l fp0b_0
= ux
, v0
, f0
348 cmp.ne p10
, p11
= r0
, r0
350 xma.hu fp1a_0
= ux
, v0
, f0
352 xma.l fp0b_1
= uy
, v0
, f0
353 xma.hu fp1a_1
= uy
, v0
, f0
355 getfsig acc0
= fp0b_0
356 xma.l fp1b_0
= ux
, v1
, fp1a_0
358 xma.hu fp2a_0
= ux
, v1
, fp1a_0
360 xma.l fp0b_2
= u_2
, v0
, f0
361 xma.hu fp1a_2
= u_2
, v0
, f0
363 getfsig pr0_1
= fp0b_1
364 xma.l fp1b_1
= uy
, v1
, fp1a_1
365 xma.hu fp2a_1
= uy
, v1
, fp1a_1
368 getfsig pr1_0
= fp1b_0
370 getfsig acc1_0
= fp2a_0
371 xma.l fp0b_3
= u_3
, v0
, f0
372 xma.hu fp1a_3
= u_3
, v0
, f0
376 C
*** MAIN
LOOP START
***
379 .pred.rel
"mutex", p8
, p9
380 .pred.rel
"mutex", p12
, p13
382 getfsig pr1_2
= fp1b_2
383 (p8
) cmp.leu p6
, p7
= acc0
, pr0_1
384 (p9
) cmp.ltu p6
, p7
= acc0
, pr0_1
385 (p12
) cmp.leu p10
, p11
= s0
, pr1_0
386 (p13
) cmp.ltu p10
, p11
= s0
, pr1_0
388 .pred.rel
"mutex", p6
, p7
389 getfsig acc1_2
= fp2a_2
391 xma.l fp0b_1
= u_1
, v0
, f0
392 (p6
) add acc0
= pr0_2
, acc1_0
, 1
393 (p7
) add acc0
= pr0_2
, acc1_0
394 xma.hu fp1a_1
= u_1
, v0
, f0
397 .pred.rel
"mutex", p10
, p11
398 getfsig pr0_0
= fp0b_0
399 xma.l fp1b_0
= u_0
, v1
, fp1a_0
400 (p10
) add s0
= pr1_1
, acc0
, 1
401 (p11
) add s0
= pr1_1
, acc0
402 xma.hu fp2a_0
= u_0
, v1
, fp1a_0
405 .pred.rel
"mutex", p6
, p7
406 .pred.rel
"mutex", p10
, p11
408 getfsig pr1_3
= fp1b_3
409 (p6
) cmp.leu p8
, p9
= acc0
, pr0_2
410 (p7
) cmp.ltu p8
, p9
= acc0
, pr0_2
411 (p10
) cmp.leu p12
, p13
= s0
, pr1_1
412 (p11
) cmp.ltu p12
, p13
= s0
, pr1_1
414 .pred.rel
"mutex", p8
, p9
415 getfsig acc1_3
= fp2a_3
417 xma.l fp0b_2
= u_2
, v0
, f0
418 (p8
) add acc0
= pr0_3
, acc1_1
, 1
419 (p9
) add acc0
= pr0_3
, acc1_1
420 xma.hu fp1a_2
= u_2
, v0
, f0
423 .pred.rel
"mutex", p12
, p13
424 getfsig pr0_1
= fp0b_1
425 xma.l fp1b_1
= u_1
, v1
, fp1a_1
426 (p12
) add s0
= pr1_2
, acc0
, 1
427 (p13
) add s0
= pr1_2
, acc0
428 xma.hu fp2a_1
= u_1
, v1
, fp1a_1
431 .pred.rel
"mutex", p8
, p9
432 .pred.rel
"mutex", p12
, p13
434 getfsig pr1_0
= fp1b_0
435 (p8
) cmp.leu p6
, p7
= acc0
, pr0_3
436 (p9
) cmp.ltu p6
, p7
= acc0
, pr0_3
437 (p12
) cmp.leu p10
, p11
= s0
, pr1_2
438 (p13
) cmp.ltu p10
, p11
= s0
, pr1_2
440 .pred.rel
"mutex", p6
, p7
441 getfsig acc1_0
= fp2a_0
443 xma.l fp0b_3
= u_3
, v0
, f0
444 (p6
) add acc0
= pr0_0
, acc1_2
, 1
445 (p7
) add acc0
= pr0_0
, acc1_2
446 xma.hu fp1a_3
= u_3
, v0
, f0
449 .pred.rel
"mutex", p10
, p11
450 getfsig pr0_2
= fp0b_2
451 xma.l fp1b_2
= u_2
, v1
, fp1a_2
452 (p10
) add s0
= pr1_3
, acc0
, 1
453 (p11
) add s0
= pr1_3
, acc0
454 xma.hu fp2a_2
= u_2
, v1
, fp1a_2
457 .pred.rel
"mutex", p6
, p7
458 .pred.rel
"mutex", p10
, p11
460 getfsig pr1_1
= fp1b_1
461 (p6
) cmp.leu p8
, p9
= acc0
, pr0_0
462 (p7
) cmp.ltu p8
, p9
= acc0
, pr0_0
463 (p10
) cmp.leu p12
, p13
= s0
, pr1_3
464 (p11
) cmp.ltu p12
, p13
= s0
, pr1_3
466 .pred.rel
"mutex", p8
, p9
467 getfsig acc1_1
= fp2a_1
469 xma.l fp0b_0
= u_0
, v0
, f0
470 (p8
) add acc0
= pr0_1
, acc1_3
, 1
471 (p9
) add acc0
= pr0_1
, acc1_3
472 xma.hu fp1a_0
= u_0
, v0
, f0
475 .pred.rel
"mutex", p12
, p13
476 getfsig pr0_3
= fp0b_3
477 xma.l fp1b_3
= u_3
, v1
, fp1a_3
478 (p12
) add s0
= pr1_0
, acc0
, 1
479 (p13
) add s0
= pr1_0
, acc0
480 xma.hu fp2a_3
= u_3
, v1
, fp1a_3
483 C
*** MAIN
LOOP END ***
485 .pred.rel
"mutex", p8
, p9
486 .pred.rel
"mutex", p12
, p13
487 {.mmi; getfsig pr1_2 = fp1b_2
489 (p8) cmp.leu p6, p7 = acc0, pr0_1
490 }{.mmi; (p9) cmp.ltu p6, p7 = acc0, pr0_1
491 (p12) cmp.leu p10, p11 = s0, pr1_0
492 (p13) cmp.ltu p10, p11 = s0, pr1_0
494 } .pred.rel
"mutex", p6
, p7
495 {.mfi; getfsig acc1_2 = fp2a_2
496 xma.l fp0b_1 = u_1, v0, f0
498 }{.mmf; (p6) add acc0 = pr0_2, acc1_0, 1
499 (p7) add acc0 = pr0_2, acc1_0
500 xma.hu fp1a_1 = u_1, v0, f0
504 .pred.rel
"mutex", p10
, p11
505 {.mfi; getfsig pr0_0 = fp0b_0
506 xma.l fp1b_0 = u_0, v1, fp1a_0
507 (p10) add s0 = pr1_1, acc0, 1
508 }{.mfi; (p11) add s0 = pr1_1, acc0
509 xma.hu fp2a_0 = u_0, v1, fp1a_0
512 } .pred.rel
"mutex", p6
, p7
513 .pred.rel
"mutex", p10
, p11
514 {.mmi; getfsig pr1_3 = fp1b_3
516 (p6) cmp.leu p8, p9 = acc0, pr0_2
517 }{.mmi; (p7) cmp.ltu p8, p9 = acc0, pr0_2
518 (p10) cmp.leu p12, p13 = s0, pr1_1
519 (p11) cmp.ltu p12, p13 = s0, pr1_1
521 } .pred.rel
"mutex", p8
, p9
522 {.mfi; getfsig acc1_3 = fp2a_3
523 xma.l fp0b_2 = u_2, v0, f0
525 }{.mmf; (p8) add acc0 = pr0_3, acc1_1, 1
526 (p9) add acc0 = pr0_3, acc1_1
527 xma.hu fp1a_2 = u_2, v0, f0
531 .pred.rel
"mutex", p12
, p13
532 {.mfi; getfsig pr0_1 = fp0b_1
533 xma.l fp1b_1 = u_1, v1, fp1a_1
534 (p12) add s0 = pr1_2, acc0, 1
535 }{.mfi; (p13) add s0 = pr1_2, acc0
536 xma.hu fp2a_1 = u_1, v1, fp1a_1
539 } .pred.rel
"mutex", p8
, p9
540 .pred.rel
"mutex", p12
, p13
541 {.mmi; getfsig pr1_0 = fp1b_0
543 (p8) cmp.leu p6, p7 = acc0, pr0_3
544 }{.mmi; (p9) cmp.ltu p6, p7 = acc0, pr0_3
545 (p12) cmp.leu p10, p11 = s0, pr1_2
546 (p13) cmp.ltu p10, p11 = s0, pr1_2
548 } .pred.rel
"mutex", p6
, p7
549 {.mmi; getfsig acc1_0 = fp2a_0
550 (p6) add acc0 = pr0_0, acc1_2, 1
551 (p7) add acc0 = pr0_0, acc1_2
555 .pred.rel
"mutex", p10
, p11
556 {.mfi; getfsig pr0_2 = fp0b_2
557 xma.l fp1b_2 = u_2, v1, fp1a_2
558 (p10) add s0 = pr1_3, acc0, 1
559 }{.mfi; (p11) add s0 = pr1_3, acc0
560 xma.hu fp2a_2 = u_2, v1, fp1a_2
563 } .pred.rel
"mutex", p6
, p7
564 .pred.rel
"mutex", p10
, p11
565 {.mmi; getfsig pr1_1 = fp1b_1
567 (p6) cmp.leu p8, p9 = acc0, pr0_0
568 }{.mmi; (p7) cmp.ltu p8, p9 = acc0, pr0_0
569 (p10) cmp.leu p12, p13 = s0, pr1_3
570 (p11) cmp.ltu p12, p13 = s0, pr1_3
572 } .pred.rel
"mutex", p8
, p9
573 {.mmi; getfsig acc1_1 = fp2a_1
574 (p8) add acc0 = pr0_1, acc1_3, 1
575 (p9) add acc0 = pr0_1, acc1_3
577 } .pred.rel
"mutex", p12
, p13
578 {.mmi; (p12) add s0 = pr1_0, acc0, 1
579 (p13) add s0 = pr1_0, acc0
582 } .pred.rel
"mutex", p8
, p9
583 .pred.rel
"mutex", p12
, p13
584 {.mmi; getfsig pr1_2 = fp1b_2
586 (p8) cmp.leu p6, p7 = acc0, pr0_1
587 }{.mmi; (p9) cmp.ltu p6, p7 = acc0, pr0_1
588 (p12) cmp.leu p10, p11 = s0, pr1_0
589 (p13) cmp.ltu p10, p11 = s0, pr1_0
591 } .pred.rel
"mutex", p6
, p7
592 {.mmi; getfsig r8 = fp2a_2
593 (p6) add acc0 = pr0_2, acc1_0, 1
594 (p7) add acc0 = pr0_2, acc1_0
596 } .pred.rel
"mutex", p10
, p11
597 {.mmi; (p10) add s0 = pr1_1, acc0, 1
598 (p11) add s0 = pr1_1, acc0
599 (p6) cmp.leu p8, p9 = acc0, pr0_2
601 } .pred.rel
"mutex", p10
, p11
602 {.mmi; (p7) cmp.ltu p8, p9 = acc0, pr0_2
603 (p10) cmp.leu p12, p13 = s0, pr1_1
604 (p11) cmp.ltu p12, p13 = s0, pr1_1
606 } .pred.rel
"mutex", p8
, p9
607 {.mmi; st8 [rp] = s0, 8
608 (p8) add acc0 = pr1_2, acc1_1, 1
609 (p9) add acc0 = pr1_2, acc1_1
611 } .pred.rel
"mutex", p8
, p9
612 {.mmi; (p8) cmp.leu p10, p11 = acc0, pr1_2
613 (p9) cmp.ltu p10, p11 = acc0, pr1_2
614 (p12) add acc0 = 1, acc0
616 }{.mmi; st8 [rp] = acc0, 8
617 (p12) cmpeqor p10, p0 = 0, acc0
620 }{.mib; (p10) add r8 = 1, r8