beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / ia64 / submul_1.asm
blobcb2a5525b5ab4379d7717f0067a64da4a75bc96d
1 dnl IA-64 mpn_submul_1 -- Multiply a limb vector with a limb and subtract the
2 dnl result from a second limb vector.
4 dnl Contributed to the GNU project by Torbjorn Granlund.
6 dnl Copyright 2000-2004 Free Software Foundation, Inc.
8 dnl This file is part of the GNU MP Library.
9 dnl
10 dnl The GNU MP Library is free software; you can redistribute it and/or modify
11 dnl it under the terms of either:
12 dnl
13 dnl * the GNU Lesser General Public License as published by the Free
14 dnl Software Foundation; either version 3 of the License, or (at your
15 dnl option) any later version.
16 dnl
17 dnl or
18 dnl
19 dnl * the GNU General Public License as published by the Free Software
20 dnl Foundation; either version 2 of the License, or (at your option) any
21 dnl later version.
22 dnl
23 dnl or both in parallel, as here.
24 dnl
25 dnl The GNU MP Library is distributed in the hope that it will be useful, but
26 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
27 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
28 dnl for more details.
29 dnl
30 dnl You should have received copies of the GNU General Public License and the
31 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
32 dnl see https://www.gnu.org/licenses/.
34 include(`../config.m4')
36 C cycles/limb
37 C Itanium: 4.0
38 C Itanium 2: 2.25 (alignment dependent, sometimes it seems to need 3 c/l)
40 C TODO
41 C * Optimize feed-in and wind-down code, both for speed and code size.
42 C * Handle low limb input and results specially, using a common stf8 in the
43 C epilogue.
44 C * Delay r8, r10 initialization, put cmp-p6 in 1st bundle and br .Ldone in
45 C 2nd bundle. This will allow the bbb bundle to be one cycle earlier and
46 C save a cycle.
48 C INPUT PARAMETERS
C The four arguments arrive in r32-r35 and get symbolic names here.
C   rp  limb vector that is read with ldf8 and written with st8 (through the
C       r10 copy); it receives the difference.
C   up  limb vector whose limbs are multiplied by vl.
C   n   limb count; n - 1 and n mod 4 select the loop count and feed-in path.
C   vl  the single multiplier limb.
49 define(`rp', `r32')
50 define(`up', `r33')
51 define(`n', `r34')
52 define(`vl', `r35')
C mpn_submul_1 entry and setup.  Per the file header this routine multiplies
C the limb vector at up by the limb vl and subtracts the product from the limb
C vector at rp.  r8 is the running carry limb (cylimb) and is what is left in
C r8 when the routine returns.
C
C Register roles established here:
C   r10     copy of rp, used later as the st8 store pointer
C   r9      copy of up, used later for the ld8 integer reloads of up[i]
C   rp, up  post-incremented by the ldf8 loads that feed the multiplier
C   f6      the negated vl, used as the multiplier in every xma
C   f8, f7  first limb of rp and first limb of up
C   r2      saved ar.lc, restored before each return
C   ar.lc   (n - 1) >> 2, the count consumed by the br.cloop instructions
C NOTE(review): n == 0 gets no special handling here; presumably callers
C always pass n >= 1 -- confirm against the mpn interface contract.
54 ASM_START()
55 PROLOGUE(mpn_submul_1)
56 .prologue
57 .save ar.lc, r2
58 .body
C With the 32-bit ABI, widen the two pointers (addp4) and zero-extend n (zxt4).
60 ifdef(`HAVE_ABI_32',
61 ` addp4 rp = 0, rp C M I
62 addp4 up = 0, up C M I
63 zxt4 n = n C I
C Keep integer copies of both pointers and negate vl.  With the multiplier
C negated, each xma forms rp[i] + up[i] * (-vl), whose low half is the low
C limb of rp[i] - up[i] * vl.
66 {.mmi
67 mov r10 = rp C M I
68 mov r9 = up C M I
69 sub vl = r0, vl C M I negate vl
C Load rp[0] and up[0] into FP registers for the first xma; r19 = n - 1.
71 {.mmi
72 ldf8 f8 = [rp], 8 C M
73 ldf8 f7 = [up], 8 C M
74 add r19 = -1, n C M I n - 1
C p6 = (vl == 0); negation does not change whether vl is zero.
C r8 starts at zero, and ar.lc is saved in r2 as announced by the .save above.
77 {.mmi
78 cmp.eq p6, p0 = 0, vl C M I
79 mov r8 = 0 C M I zero cylimb
80 mov r2 = ar.lc C I0
C Move the negated vl into f6.  r14 = n mod 4 picks the feed-in path, and
C r19 becomes (n - 1) >> 2, the loop count.
82 {.mmi
83 setf.sig f6 = vl C M2 M3
84 and r14 = 3, n C M I
85 shr.u r19 = r19, 2 C I0
C vl == 0: leave through .Ldone with r8 still zero and nothing stored.
88 {.mmb
89 nop 0
90 cmp.eq p10, p0 = 0, r14 C M I
91 (p6) br.spnt .Ldone C B vl == 0
93 {.mmi
94 cmp.eq p11, p0 = 2, r14 C M I
95 cmp.eq p12, p0 = 3, r14 C M I
96 mov ar.lc = r19 C I0
C Dispatch on n mod 4: 0 -> .Lb00, 2 -> .Lb10, 3 -> .Lb11.  The remaining
C case (1) falls through into .Lb01.
98 {.bbb
99 (p10) br.dptk .Lb00 C B
100 (p11) br.dptk .Lb10 C B
101 (p12) br.dptk .Lb11 C B
C Feed-in for n mod 4 == 1 (the fall-through case of the dispatch).
C Register convention used by all feed-in paths, the loop and the wind-down:
C low product halves go to r24-r27, high halves to r28-r31, and the integer
C copies of up[i] loaded through r9 go to r20-r23.
C br.cloop branches while ar.lc is nonzero and decrements it, so falling
C through the first br.cloop is the n == 1 case.
105 .Lb01: br.cloop.dptk .grt1
C n == 1: form the single product up[0] * f6 + rp[0], move both halves to
C integer registers, reload up[0] as an integer and finish in .Lcj1.
107 xma.l f39 = f7, f6, f8
108 xma.hu f43 = f7, f6, f8
110 getf.sig r27 = f39 C lo
111 getf.sig r31 = f43 C hi
112 ld8 r20 = [r9], 8
113 br .Lcj1
C n >= 5: load limbs 1-4 of rp and up and start the products for limbs 0-1.
115 .grt1: ldf8 f44 = [rp], 8
116 ldf8 f32 = [up], 8
118 ldf8 f45 = [rp], 8
119 ldf8 f33 = [up], 8
121 ldf8 f46 = [rp], 8
122 xma.l f39 = f7, f6, f8
123 ldf8 f34 = [up], 8
124 xma.hu f43 = f7, f6, f8
126 ldf8 f47 = [rp], 8
127 xma.l f36 = f32, f6, f44
128 ldf8 f35 = [up], 8
129 xma.hu f40 = f32, f6, f44
130 br.cloop.dptk .grt5
C Not taken means n == 5: finish the remaining products, move the halves to
C integer registers, reload up[0..3] and drain the five limbs through .Lcj5.
133 getf.sig r27 = f39 C lo
134 xma.l f37 = f33, f6, f45
135 ld8 r20 = [r9], 8
136 xma.hu f41 = f33, f6, f45
138 getf.sig r31 = f43 C hi
139 getf.sig r24 = f36 C lo
140 xma.l f38 = f34, f6, f46
141 ld8 r21 = [r9], 8
142 xma.hu f42 = f34, f6, f46
144 getf.sig r28 = f40 C hi
145 getf.sig r25 = f37 C lo
146 xma.l f39 = f35, f6, f47
147 ld8 r22 = [r9], 8
148 xma.hu f43 = f35, f6, f47
150 getf.sig r29 = f41 C hi
151 getf.sig r26 = f38 C lo
152 ld8 r23 = [r9], 8
153 br .Lcj5
C n >= 9: same product and transfer sequence as above, but interleaved with
C loads of the next four limb pairs so the pipeline is full on loop entry.
155 .grt5: ldf8 f44 = [rp], 8
156 ldf8 f32 = [up], 8
158 getf.sig r27 = f39 C lo
159 xma.l f37 = f33, f6, f45
160 ld8 r20 = [r9], 8
161 xma.hu f41 = f33, f6, f45
163 ldf8 f45 = [rp], 8
164 getf.sig r31 = f43 C hi
165 ldf8 f33 = [up], 8
167 getf.sig r24 = f36 C lo
168 xma.l f38 = f34, f6, f46
169 ld8 r21 = [r9], 8
170 xma.hu f42 = f34, f6, f46
172 ldf8 f46 = [rp], 8
173 getf.sig r28 = f40 C hi
174 ldf8 f34 = [up], 8
176 getf.sig r25 = f37 C lo
177 xma.l f39 = f35, f6, f47
178 ld8 r22 = [r9], 8
179 xma.hu f43 = f35, f6, f47
181 ldf8 f47 = [rp], 8
182 getf.sig r29 = f41 C hi
183 ldf8 f35 = [up], 8
185 getf.sig r26 = f38 C lo
186 xma.l f36 = f32, f6, f44
187 ld8 r23 = [r9], 8
188 xma.hu f40 = f32, f6, f44
C Enter the main loop if the count is not exhausted (n >= 13); otherwise
C (n == 9) drain through .Lend.
189 br.cloop.dptk .Loop
190 br .Lend
C Feed-in for n mod 4 == 2.  Limb 0 is already in f7/f8 from the setup code;
C limb 1 is loaded into f35/f47 here before the loop count is tested.
193 .Lb10: ldf8 f47 = [rp], 8
194 ldf8 f35 = [up], 8
195 br.cloop.dptk .grt2
C n == 2: form both products, move the halves to integer registers, reload
C up[0] and up[1] through r9 and finish the two limbs in .Lcj2.
197 xma.l f38 = f7, f6, f8
198 xma.hu f42 = f7, f6, f8
200 xma.l f39 = f35, f6, f47
201 xma.hu f43 = f35, f6, f47
203 getf.sig r26 = f38 C lo
204 getf.sig r30 = f42 C hi
205 ld8 r23 = [r9], 8
207 getf.sig r27 = f39 C lo
208 getf.sig r31 = f43 C hi
209 ld8 r20 = [r9], 8
210 br .Lcj2
C n >= 6: load four more limb pairs while starting the products for the
C limbs already held in FP registers.
212 .grt2: ldf8 f44 = [rp], 8
213 ldf8 f32 = [up], 8
215 ldf8 f45 = [rp], 8
216 ldf8 f33 = [up], 8
217 xma.l f38 = f7, f6, f8
218 xma.hu f42 = f7, f6, f8
220 ldf8 f46 = [rp], 8
221 ldf8 f34 = [up], 8
222 xma.l f39 = f35, f6, f47
223 xma.hu f43 = f35, f6, f47
225 ldf8 f47 = [rp], 8
226 ldf8 f35 = [up], 8
228 getf.sig r26 = f38 C lo
229 xma.l f36 = f32, f6, f44
230 ld8 r23 = [r9], 8
231 xma.hu f40 = f32, f6, f44
232 br.cloop.dptk .grt6
C Not taken means n == 6: finish the products and transfers without further
C FP loads and drain the six limbs through .Lcj6.
234 getf.sig r30 = f42 C hi
236 getf.sig r27 = f39 C lo
237 xma.l f37 = f33, f6, f45
238 ld8 r20 = [r9], 8
239 xma.hu f41 = f33, f6, f45
241 getf.sig r31 = f43 C hi
242 getf.sig r24 = f36 C lo
243 xma.l f38 = f34, f6, f46
244 ld8 r21 = [r9], 8
245 xma.hu f42 = f34, f6, f46
247 getf.sig r28 = f40 C hi
248 getf.sig r25 = f37 C lo
249 xma.l f39 = f35, f6, f47
250 ld8 r22 = [r9], 8
251 xma.hu f43 = f35, f6, f47
252 br .Lcj6
C n >= 10: same sequence interleaved with loads of the next limb pairs, then
C join the main loop at its .LL10 entry point (the last quarter of .Loop).
254 .grt6: ldf8 f44 = [rp], 8
255 getf.sig r30 = f42 C hi
256 ldf8 f32 = [up], 8
258 getf.sig r27 = f39 C lo
259 xma.l f37 = f33, f6, f45
260 ld8 r20 = [r9], 8
261 xma.hu f41 = f33, f6, f45
263 ldf8 f45 = [rp], 8
264 getf.sig r31 = f43 C hi
265 ldf8 f33 = [up], 8
267 getf.sig r24 = f36 C lo
268 xma.l f38 = f34, f6, f46
269 ld8 r21 = [r9], 8
270 xma.hu f42 = f34, f6, f46
272 ldf8 f46 = [rp], 8
273 getf.sig r28 = f40 C hi
274 ldf8 f34 = [up], 8
276 getf.sig r25 = f37 C lo
277 xma.l f39 = f35, f6, f47
278 ld8 r22 = [r9], 8
279 xma.hu f43 = f35, f6, f47
280 br .LL10
C Feed-in for n mod 4 == 3.  Limb 0 is already in f7/f8 from the setup code;
C limbs 1 and 2 are loaded into f34/f46 and f35/f47 before the count test.
283 .Lb11: ldf8 f46 = [rp], 8
284 ldf8 f34 = [up], 8
286 ldf8 f47 = [rp], 8
287 ldf8 f35 = [up], 8
288 br.cloop.dptk .grt3
C n == 3: form the three products, move the halves to integer registers,
C reload up[0..2] through r9 and finish the three limbs in .Lcj3.
290 xma.l f37 = f7, f6, f8
291 xma.hu f41 = f7, f6, f8
293 xma.l f38 = f34, f6, f46
294 xma.hu f42 = f34, f6, f46
296 getf.sig r25 = f37 C lo
297 xma.l f39 = f35, f6, f47
298 xma.hu f43 = f35, f6, f47
300 getf.sig r29 = f41 C hi
301 ld8 r22 = [r9], 8
303 getf.sig r26 = f38 C lo
304 getf.sig r30 = f42 C hi
305 ld8 r23 = [r9], 8
307 getf.sig r27 = f39 C lo
308 getf.sig r31 = f43 C hi
309 ld8 r20 = [r9], 8
310 br .Lcj3
C n >= 7: load four more limb pairs while starting the products for the
C limbs already held in FP registers.
312 .grt3: ldf8 f44 = [rp], 8
313 xma.l f37 = f7, f6, f8
314 ldf8 f32 = [up], 8
315 xma.hu f41 = f7, f6, f8
317 ldf8 f45 = [rp], 8
318 xma.l f38 = f34, f6, f46
319 ldf8 f33 = [up], 8
320 xma.hu f42 = f34, f6, f46
322 ldf8 f46 = [rp], 8
323 ldf8 f34 = [up], 8
325 getf.sig r25 = f37 C lo
326 xma.l f39 = f35, f6, f47
327 ld8 r22 = [r9], 8
328 xma.hu f43 = f35, f6, f47
330 ldf8 f47 = [rp], 8
331 getf.sig r29 = f41 C hi
332 ldf8 f35 = [up], 8
334 getf.sig r26 = f38 C lo
335 xma.l f36 = f32, f6, f44
336 ld8 r23 = [r9], 8
337 xma.hu f40 = f32, f6, f44
338 br.cloop.dptk .grt7
C Not taken means n == 7: finish the products and transfers without further
C FP loads and drain the seven limbs through .Lcj7.
341 getf.sig r30 = f42 C hi
342 getf.sig r27 = f39 C lo
343 xma.l f37 = f33, f6, f45
344 ld8 r20 = [r9], 8
345 xma.hu f41 = f33, f6, f45
347 getf.sig r31 = f43 C hi
348 getf.sig r24 = f36 C lo
349 xma.l f38 = f34, f6, f46
350 ld8 r21 = [r9], 8
351 xma.hu f42 = f34, f6, f46
352 br .Lcj7
C n >= 11: same sequence interleaved with loads of the next limb pairs, then
C join the main loop at its .LL11 entry point (the third quarter of .Loop).
354 .grt7: ldf8 f44 = [rp], 8
355 getf.sig r30 = f42 C hi
356 ldf8 f32 = [up], 8
358 getf.sig r27 = f39 C lo
359 xma.l f37 = f33, f6, f45
360 ld8 r20 = [r9], 8
361 xma.hu f41 = f33, f6, f45
363 ldf8 f45 = [rp], 8
364 getf.sig r31 = f43 C hi
365 ldf8 f33 = [up], 8
367 getf.sig r24 = f36 C lo
368 xma.l f38 = f34, f6, f46
369 ld8 r21 = [r9], 8
370 xma.hu f42 = f34, f6, f46
371 br .LL11
C Feed-in for n mod 4 == 0.  Limb 0 is already in f7/f8 from the setup code;
C limbs 1-3 are loaded here and the product for limb 0 is started before the
C loop count is tested.
374 .Lb00: ldf8 f45 = [rp], 8
375 ldf8 f33 = [up], 8
377 ldf8 f46 = [rp], 8
378 ldf8 f34 = [up], 8
380 ldf8 f47 = [rp], 8
381 xma.l f36 = f7, f6, f8
382 ldf8 f35 = [up], 8
383 xma.hu f40 = f7, f6, f8
384 br.cloop.dptk .grt4
C n == 4: form the remaining three products, move all halves to integer
C registers, reload up[0..3] through r9 and finish the four limbs in .Lcj4.
386 xma.l f37 = f33, f6, f45
387 xma.hu f41 = f33, f6, f45
389 getf.sig r24 = f36 C lo
390 xma.l f38 = f34, f6, f46
391 ld8 r21 = [r9], 8
392 xma.hu f42 = f34, f6, f46
394 getf.sig r28 = f40 C hi
395 xma.l f39 = f35, f6, f47
396 getf.sig r25 = f37 C lo
397 ld8 r22 = [r9], 8
398 xma.hu f43 = f35, f6, f47
400 getf.sig r29 = f41 C hi
401 getf.sig r26 = f38 C lo
402 ld8 r23 = [r9], 8
404 getf.sig r30 = f42 C hi
405 getf.sig r27 = f39 C lo
406 ld8 r20 = [r9], 8
407 br .Lcj4
C n >= 8: load four more limb pairs while continuing the products and the
C transfers to integer registers for the limbs already in FP registers.
409 .grt4: ldf8 f44 = [rp], 8
410 xma.l f37 = f33, f6, f45
411 ldf8 f32 = [up], 8
412 xma.hu f41 = f33, f6, f45
414 ldf8 f45 = [rp], 8
415 ldf8 f33 = [up], 8
416 xma.l f38 = f34, f6, f46
417 getf.sig r24 = f36 C lo
418 ld8 r21 = [r9], 8
419 xma.hu f42 = f34, f6, f46
421 ldf8 f46 = [rp], 8
422 getf.sig r28 = f40 C hi
423 ldf8 f34 = [up], 8
424 xma.l f39 = f35, f6, f47
425 getf.sig r25 = f37 C lo
426 ld8 r22 = [r9], 8
427 xma.hu f43 = f35, f6, f47
429 ldf8 f47 = [rp], 8
430 getf.sig r29 = f41 C hi
431 ldf8 f35 = [up], 8
433 getf.sig r26 = f38 C lo
434 xma.l f36 = f32, f6, f44
435 ld8 r23 = [r9], 8
436 xma.hu f40 = f32, f6, f44
437 br.cloop.dptk .grt8
C Not taken means n == 8: one more product and transfer step, then drain the
C eight limbs through .Lcj8.
440 getf.sig r30 = f42 C hi
441 getf.sig r27 = f39 C lo
442 xma.l f37 = f33, f6, f45
443 ld8 r20 = [r9], 8
444 xma.hu f41 = f33, f6, f45
445 br .Lcj8
C n >= 12: same step with one more limb pair loaded, then join the main loop
C at its .LL00 entry point (the second quarter of .Loop).
447 .grt8: ldf8 f44 = [rp], 8
448 getf.sig r30 = f42 C hi
449 ldf8 f32 = [up], 8
451 getf.sig r27 = f39 C lo
452 xma.l f37 = f33, f6, f45
453 ld8 r20 = [r9], 8
454 xma.hu f41 = f33, f6, f45
455 br .LL00
C Main loop: four limbs per iteration, software pipelined.  Each quarter does
C the same work on a rotating register set:
C   cmp.ltu p6 / sub r14   r14 = lo - r8, p6 = borrow out of that subtraction
C   st8 [r10] = r14        store the result limb and advance the store pointer
C   sub r8 = up[i] - hi    next carry limb before the borrow correction
C   (p6) add r8 = 1, r8    add the borrow from the low subtraction
C interleaved with ldf8 loads, xma products and getf.sig transfers for limbs
C that are consumed in later quarters.
C
C Why up[i] - hi: f6 holds the negated vl, so for nonzero vl each xma pair
C yields rp[i] + up[i] * (2^64 - vl).  Its high half therefore equals up[i]
C minus the limb that must be carried out of rp[i] - up[i] * vl.
C
C .LL00, .LL11 and .LL10 are entry points used by the feed-in code for
C n mod 4 == 0, 3 and 2 respectively.
457 ALIGN(32)
458 .Loop:
C Quarter 1: low half r27, high half r31, up copy r20.
459 {.mmi
460 ldf8 f44 = [rp], 8
461 cmp.ltu p6, p0 = r27, r8 C lo cmp
462 sub r14 = r27, r8 C lo sub
464 {.mmi
465 getf.sig r30 = f42 C hi
466 ldf8 f32 = [up], 8
467 sub r8 = r20, r31 C hi sub
468 ;; C 01
470 {.mmf
471 getf.sig r27 = f39 C lo
472 st8 [r10] = r14, 8
473 xma.l f37 = f33, f6, f45
475 {.mfi
476 ld8 r20 = [r9], 8
477 xma.hu f41 = f33, f6, f45
478 (p6) add r8 = 1, r8
479 ;; C 02
C Quarter 2: low half r24, high half r28, up copy r21.
481 {.mmi
482 .LL00: ldf8 f45 = [rp], 8
483 cmp.ltu p6, p0 = r24, r8
484 sub r14 = r24, r8
486 {.mmi
487 getf.sig r31 = f43 C hi
488 ldf8 f33 = [up], 8
489 sub r8 = r21, r28
490 ;; C 03
492 {.mmf
493 getf.sig r24 = f36 C lo
494 st8 [r10] = r14, 8
495 xma.l f38 = f34, f6, f46
497 {.mfi
498 ld8 r21 = [r9], 8
499 xma.hu f42 = f34, f6, f46
500 (p6) add r8 = 1, r8
501 ;; C 04
C Quarter 3: low half r25, high half r29, up copy r22.
503 {.mmi
504 .LL11: ldf8 f46 = [rp], 8
505 cmp.ltu p6, p0 = r25, r8
506 sub r14 = r25, r8
508 {.mmi
509 getf.sig r28 = f40 C hi
510 ldf8 f34 = [up], 8
511 sub r8 = r22, r29
512 ;; C 05
514 {.mmf
515 getf.sig r25 = f37 C lo
516 st8 [r10] = r14, 8
517 xma.l f39 = f35, f6, f47
519 {.mfi
520 ld8 r22 = [r9], 8
521 xma.hu f43 = f35, f6, f47
522 (p6) add r8 = 1, r8
523 ;; C 06
C Quarter 4: low half r26, high half r30, up copy r23.
525 {.mmi
526 .LL10: ldf8 f47 = [rp], 8
527 cmp.ltu p6, p0 = r26, r8
528 sub r14 = r26, r8
530 {.mmi
531 getf.sig r29 = f41 C hi
532 ldf8 f35 = [up], 8
533 sub r8 = r23, r30
534 ;; C 07
536 {.mmf
537 getf.sig r26 = f38 C lo
538 st8 [r10] = r14, 8
539 xma.l f36 = f32, f6, f44
541 {.mfi
542 ld8 r23 = [r9], 8
543 xma.hu f40 = f32, f6, f44
544 (p6) add r8 = 1, r8
C Repeat while ar.lc is nonzero; otherwise fall through into the wind-down.
546 br.cloop.dptk .Loop
C Wind-down.  Each stage repeats the loop step (compare and subtract the low
C half against r8, store through r10, form the next r8 from the up copy and
C the high half, add the borrow) while the xma, getf.sig and ld8 work thins
C out as the pipeline empties.  No further ldf8 loads are issued here.
C .Lend is entered with nine limbs still to be stored, and each .LcjK label
C is entered with K limbs still to be stored.  The feed-in code jumps
C straight to the matching .LcjK for short operands.
549 .Lend:
550 cmp.ltu p6, p0 = r27, r8
551 sub r14 = r27, r8
552 getf.sig r30 = f42
553 sub r8 = r20, r31
555 getf.sig r27 = f39
556 st8 [r10] = r14, 8
557 xma.l f37 = f33, f6, f45
558 ld8 r20 = [r9], 8
559 xma.hu f41 = f33, f6, f45
560 (p6) add r8 = 1, r8
562 .Lcj8:
563 cmp.ltu p6, p0 = r24, r8
564 sub r14 = r24, r8
565 getf.sig r31 = f43
566 sub r8 = r21, r28
568 getf.sig r24 = f36
569 st8 [r10] = r14, 8
570 xma.l f38 = f34, f6, f46
571 ld8 r21 = [r9], 8
572 xma.hu f42 = f34, f6, f46
573 (p6) add r8 = 1, r8
575 .Lcj7:
576 cmp.ltu p6, p0 = r25, r8
577 sub r14 = r25, r8
578 getf.sig r28 = f40
579 sub r8 = r22, r29
581 getf.sig r25 = f37
582 st8 [r10] = r14, 8
583 xma.l f39 = f35, f6, f47
584 ld8 r22 = [r9], 8
585 xma.hu f43 = f35, f6, f47
586 (p6) add r8 = 1, r8
C From here on all products have been started; only transfers to integer
C registers and reloads of up[i] remain.
588 .Lcj6:
589 cmp.ltu p6, p0 = r26, r8
590 sub r14 = r26, r8
591 getf.sig r29 = f41
592 sub r8 = r23, r30
594 getf.sig r26 = f38
595 st8 [r10] = r14, 8
596 ld8 r23 = [r9], 8
597 (p6) add r8 = 1, r8
599 .Lcj5:
600 cmp.ltu p6, p0 = r27, r8
601 sub r14 = r27, r8
602 getf.sig r30 = f42
603 sub r8 = r20, r31
605 getf.sig r27 = f39
606 st8 [r10] = r14, 8
607 ld8 r20 = [r9], 8
608 (p6) add r8 = 1, r8
610 .Lcj4:
611 cmp.ltu p6, p0 = r24, r8
612 sub r14 = r24, r8
613 getf.sig r31 = f43
614 sub r8 = r21, r28
616 st8 [r10] = r14, 8
617 (p6) add r8 = 1, r8
C The last three stages only consume values already in integer registers.
619 .Lcj3:
620 cmp.ltu p6, p0 = r25, r8
621 sub r14 = r25, r8
622 sub r8 = r22, r29
624 st8 [r10] = r14, 8
625 (p6) add r8 = 1, r8
627 .Lcj2:
628 cmp.ltu p6, p0 = r26, r8
629 sub r14 = r26, r8
630 sub r8 = r23, r30
632 st8 [r10] = r14, 8
633 (p6) add r8 = 1, r8
C Final limb: store it, restore ar.lc from r2 and return through b0 with the
C carry limb left in r8.
635 .Lcj1:
636 cmp.ltu p6, p0 = r27, r8
637 sub r14 = r27, r8
638 sub r8 = r20, r31
640 st8 [r10] = r14, 8
641 mov ar.lc = r2
642 (p6) add r8 = 1, r8
643 br.ret.sptk.many b0
C .Ldone is the target of the vl == 0 branch in the setup code: write r2 back
C to ar.lc and return.  r8 was zeroed before that branch and nothing is stored.
644 .Ldone: mov ar.lc = r2
645 br.ret.sptk.many b0
646 EPILOGUE()
647 ASM_END()