beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / ia64 / addmul_1.asm
blobffa3297763cdc374d61d99ea11018e2925bd5834
1 dnl IA-64 mpn_addmul_1 -- Multiply a limb vector with a limb and add the
2 dnl result to a second limb vector.
4 dnl Contributed to the GNU project by Torbjorn Granlund.
6 dnl Copyright 2000-2005, 2007 Free Software Foundation, Inc.
8 dnl This file is part of the GNU MP Library.
9 dnl
10 dnl The GNU MP Library is free software; you can redistribute it and/or modify
11 dnl it under the terms of either:
12 dnl
13 dnl * the GNU Lesser General Public License as published by the Free
14 dnl Software Foundation; either version 3 of the License, or (at your
15 dnl option) any later version.
16 dnl
17 dnl or
18 dnl
19 dnl * the GNU General Public License as published by the Free Software
20 dnl Foundation; either version 2 of the License, or (at your option) any
21 dnl later version.
22 dnl
23 dnl or both in parallel, as here.
24 dnl
25 dnl The GNU MP Library is distributed in the hope that it will be useful, but
26 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
27 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
28 dnl for more details.
29 dnl
30 dnl You should have received copies of the GNU General Public License and the
31 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
32 dnl see https://www.gnu.org/licenses/.
34 include(`../config.m4')
36 C cycles/limb
37 C Itanium: 3.0
38 C Itanium 2: 2.0
40 C TODO
41 C * Further optimize feed-in and wind-down code, both for speed and code size.
42 C * Handle low limb input and results specially, using a common stf8 in the
43 C epilogue.
44 C * Use 1 c/l carry propagation scheme in wind-down code.
45 C * Use extra pointer registers for `up' and rp to speed up feed-in loads.
46 C * Work out final differences with mul_1.asm. That function is 300 bytes
47 C smaller than this due to better loop scheduling and thus simpler feed-in
48 C code.
50 C INPUT PARAMETERS
C Symbolic names for the four argument registers used throughout the file:
C   rp   vector that is both read and written back (result vector)
C   up   vector that is only read (multiplicand limbs)
C   n    limb count; n mod 4 and (n - 1) / 4 steer the code below
C   vl   the single limb every up[] limb is multiplied by
51 define(`rp', `r32')
52 define(`up', `r33')
53 define(`n', `r34')
54 define(`vl', `r35')
C mpn_addmul_1 -- for each of the n limbs form up[i] * vl + rp[i], chain the
C carries, and store the low limbs back through r20 (a copy of rp).  The
C final carry limb is left in r8.
C
C Register use, as read off the code below:
C   f6              multiplier vl
C   f7, f8          first limb of up[] and of rp[]
C   f32-f35         later up[] limbs;  f44-f47  later rp[] limbs
C   f36-f39         low product halves, moved to r24-r27
C   f40-f43         high product halves, moved to r28-r31 (last one to r8)
C   r14, r16        sums on their way to memory
C   p6/p7, p8/p9    carry / no-carry predicate pairs, used alternately
56 ASM_START()
57 PROLOGUE(mpn_addmul_1)
58 .prologue
59 .save ar.lc, r2
60 .body
C With HAVE_ABI_32 defined, rp and up are passed through addp4 and n through
C zxt4 before anything else uses them.
C NOTE(review): the closing of this ifdef is not visible in this copy of the
C file -- confirm against upstream.
62 ifdef(`HAVE_ABI_32',
63 ` addp4 rp = 0, rp C M I
64 addp4 up = 0, up C M I
65 zxt4 n = n C I
C r15 = n - 1; r20 = working copy of rp used for every store; r2 keeps the
C incoming ar.lc so it can be put back before returning.
68 {.mmi
69 adds r15 = -1, n C M I
70 mov r20 = rp C M I
71 mov.i r2 = ar.lc C I0
C Fetch the first limb of each vector (both pointers post-increment by 8);
C r14 = n mod 4 selects the feed-in path.
73 {.mmi
74 ldf8 f7 = [up], 8 C M
75 ldf8 f8 = [rp], 8 C M
76 and r14 = 3, n C M I
C f6 = vl for the xma multiplies; r31 = (n - 1) / 4 becomes the loop count.
79 {.mmi
80 setf.sig f6 = vl C M2 M3
81 cmp.eq p10, p0 = 0, r14 C M I
82 shr.u r31 = r15, 2 C I0
84 {.mmi
85 cmp.eq p11, p0 = 2, r14 C M I
86 cmp.eq p12, p0 = 3, r14 C M I
87 nop.i 0 C I
C A register never differs from itself, so both compares clear the carry
C predicates (p6, p8) and set the no-carry ones (p7, p9).
90 {.mii
91 cmp.ne p6, p7 = r0, r0 C M I
92 mov.i ar.lc = r31 C I0
93 cmp.ne p8, p9 = r0, r0 C M I
C Dispatch on n mod 4: 0 -> .Lb00, 2 -> .Lb10, 3 -> .Lb11; the remaining
C case 1 falls through.
95 {.bbb
96 (p10) br.dptk .Lb00 C B
97 (p11) br.dptk .Lb10 C B
98 (p12) br.dptk .Lb11 C B
C Feed-in for n mod 4 == 1.  ar.lc holds (n - 1) / 4, so the counted branch
C below is taken exactly when more than one limb is present.
102 .Lb01: br.cloop.dptk .grt1 C B
C n == 1: a single multiply-add.  Low half goes to memory, high half to r8.
104 xma.l f39 = f7, f6, f8 C F
105 xma.hu f43 = f7, f6, f8 C F
107 getf.sig r8 = f43 C M2
108 stf8 [r20] = f39 C M2 M3
C Put back the loop counter saved at entry and return.
109 mov.i ar.lc = r2 C I0
110 br.ret.sptk.many b0 C B
C n >= 5: queue up four more limb pairs while the first product is formed.
112 .grt1:
113 ldf8 f32 = [up], 8
114 ldf8 f44 = [rp], 8
116 ldf8 f33 = [up], 8
117 ldf8 f45 = [rp], 8
119 ldf8 f34 = [up], 8
120 xma.l f39 = f7, f6, f8
121 ldf8 f46 = [rp], 8
122 xma.hu f43 = f7, f6, f8
124 ldf8 f35 = [up], 8
125 ldf8 f47 = [rp], 8
126 br.cloop.dptk .grt5
C n == 5: form the remaining four products here.  The first low half is
C stored unchanged since no high half precedes it; then join the wind-down.
128 xma.l f36 = f32, f6, f44
129 xma.hu f40 = f32, f6, f44
131 stf8 [r20] = f39, 8
132 xma.l f37 = f33, f6, f45
133 xma.hu f41 = f33, f6, f45
135 getf.sig r31 = f43
136 getf.sig r24 = f36
137 xma.l f38 = f34, f6, f46
138 xma.hu f42 = f34, f6, f46
140 getf.sig r28 = f40
141 getf.sig r25 = f37
142 xma.l f39 = f35, f6, f47
143 xma.hu f43 = f35, f6, f47
145 getf.sig r29 = f41
146 getf.sig r26 = f38
147 br .Lcj5
C n >= 9: r30 = 0 stands in for the high half that would precede the first
C low half (r27) when it enters the carry chain at .Loop or .Le0.
149 .grt5:
150 mov r30 = 0
151 xma.l f36 = f32, f6, f44
152 xma.hu f40 = f32, f6, f44
154 ldf8 f32 = [up], 8
155 xma.l f37 = f33, f6, f45
156 ldf8 f44 = [rp], 8
157 xma.hu f41 = f33, f6, f45
159 ldf8 f33 = [up], 8
160 getf.sig r27 = f39
162 getf.sig r31 = f43
163 xma.l f38 = f34, f6, f46
164 ldf8 f45 = [rp], 8
165 xma.hu f42 = f34, f6, f46
167 ldf8 f34 = [up], 8
168 getf.sig r24 = f36
170 getf.sig r28 = f40
171 xma.l f39 = f35, f6, f47
172 ldf8 f46 = [rp], 8
173 xma.hu f43 = f35, f6, f47
175 ldf8 f35 = [up], 8
176 getf.sig r25 = f37
C ar.lc still nonzero -> main loop; otherwise go straight to the wind-down.
177 br.cloop.dptk .Loop
178 br .Le0
C Feed-in for n mod 4 == 2.  The second limb pair is fetched first; the
C counted branch is taken when more than two limbs are present.
181 .Lb10: ldf8 f35 = [up], 8
182 ldf8 f47 = [rp], 8
183 br.cloop.dptk .grt2
C n == 2: both products are formed here.  The first low half is stored as
C is; high half one (r30) and low half two (r27) are summed at .Lcj2, and
C the second high half is already placed in r8.
185 xma.l f38 = f7, f6, f8
186 xma.hu f42 = f7, f6, f8
188 xma.l f39 = f35, f6, f47
189 xma.hu f43 = f35, f6, f47
191 getf.sig r30 = f42
192 stf8 [r20] = f38, 8
193 getf.sig r27 = f39
194 getf.sig r8 = f43
195 br .Lcj2
C n >= 6: preload four more limb pairs while the first two products start.
197 .grt2:
198 ldf8 f32 = [up], 8
199 ldf8 f44 = [rp], 8
201 ldf8 f33 = [up], 8
202 xma.l f38 = f7, f6, f8
203 ldf8 f45 = [rp], 8
204 xma.hu f42 = f7, f6, f8
206 ldf8 f34 = [up], 8
207 xma.l f39 = f35, f6, f47
208 ldf8 f46 = [rp], 8
209 xma.hu f43 = f35, f6, f47
211 ldf8 f35 = [up], 8
212 ldf8 f47 = [rp], 8
213 br.cloop.dptk .grt6
C n == 6: store the first low half directly, form the last four products
C and hand over to the wind-down at .Lcj6.
215 stf8 [r20] = f38, 8
216 xma.l f36 = f32, f6, f44
217 xma.hu f40 = f32, f6, f44
219 getf.sig r30 = f42
220 getf.sig r27 = f39
221 xma.l f37 = f33, f6, f45
222 xma.hu f41 = f33, f6, f45
224 getf.sig r31 = f43
225 getf.sig r24 = f36
226 xma.l f38 = f34, f6, f46
227 xma.hu f42 = f34, f6, f46
229 getf.sig r28 = f40
230 getf.sig r25 = f37
231 xma.l f39 = f35, f6, f47
232 xma.hu f43 = f35, f6, f47
233 br .Lcj6
C n >= 10: r29 = 0 stands in for the high half that would precede the first
C low half (r26) when it enters the carry chain at .LL10 inside the loop.
235 .grt6:
236 mov r29 = 0
237 xma.l f36 = f32, f6, f44
238 xma.hu f40 = f32, f6, f44
240 ldf8 f32 = [up], 8
241 getf.sig r26 = f38
243 getf.sig r30 = f42
244 xma.l f37 = f33, f6, f45
245 ldf8 f44 = [rp], 8
246 xma.hu f41 = f33, f6, f45
248 ldf8 f33 = [up], 8
249 getf.sig r27 = f39
251 getf.sig r31 = f43
252 xma.l f38 = f34, f6, f46
253 ldf8 f45 = [rp], 8
254 xma.hu f42 = f34, f6, f46
256 ldf8 f34 = [up], 8
257 getf.sig r24 = f36
258 br .LL10
C Feed-in for n mod 4 == 3.  Limb pairs two and three are fetched first; the
C counted branch is taken when more than three limbs are present.
261 .Lb11: ldf8 f34 = [up], 8
262 ldf8 f46 = [rp], 8
264 ldf8 f35 = [up], 8
265 ldf8 f47 = [rp], 8
266 br.cloop.dptk .grt3
C n == 3: all three products are formed here.  The first low half is stored
C as is, the last high half goes to r8, and the two remaining sums are done
C from .Lcj3 on.
269 xma.l f37 = f7, f6, f8
270 xma.hu f41 = f7, f6, f8
271 xma.l f38 = f34, f6, f46
272 xma.hu f42 = f34, f6, f46
273 xma.l f39 = f35, f6, f47
274 xma.hu f43 = f35, f6, f47
276 getf.sig r29 = f41
277 stf8 [r20] = f37, 8
278 getf.sig r26 = f38
279 getf.sig r30 = f42
280 getf.sig r27 = f39
281 getf.sig r8 = f43
282 br .Lcj3
C n >= 7: preload four more limb pairs while the first three products start.
284 .grt3:
285 ldf8 f32 = [up], 8
286 xma.l f37 = f7, f6, f8
287 ldf8 f44 = [rp], 8
288 xma.hu f41 = f7, f6, f8
290 ldf8 f33 = [up], 8
291 xma.l f38 = f34, f6, f46
292 ldf8 f45 = [rp], 8
293 xma.hu f42 = f34, f6, f46
295 ldf8 f34 = [up], 8
296 xma.l f39 = f35, f6, f47
297 ldf8 f46 = [rp], 8
298 xma.hu f43 = f35, f6, f47
300 ldf8 f35 = [up], 8
C NOTE(review): r25 fetched here is consumed only on the .grt7 path (at
C .LL11); on the n == 7 path it is reloaded at .Lcj7 before any use.
C Presumably that is what the FIXME marks -- confirm.
301 getf.sig r25 = f37 C FIXME
302 ldf8 f47 = [rp], 8
303 br.cloop.dptk .grt7
C n == 7: store the first low half directly, form three more products and
C hand over to the wind-down at .Lcj7, which issues the last multiply.
305 getf.sig r29 = f41
306 stf8 [r20] = f37, 8 C FIXME
307 xma.l f36 = f32, f6, f44
308 getf.sig r26 = f38
309 xma.hu f40 = f32, f6, f44
311 getf.sig r30 = f42
312 xma.l f37 = f33, f6, f45
313 getf.sig r27 = f39
314 xma.hu f41 = f33, f6, f45
316 getf.sig r31 = f43
317 xma.l f38 = f34, f6, f46
318 getf.sig r24 = f36
319 xma.hu f42 = f34, f6, f46
320 br .Lcj7
C n >= 11: r28 = 0 stands in for the high half that would precede the first
C low half (r25) when it enters the carry chain at .LL11 inside the loop.
322 .grt7:
323 getf.sig r29 = f41
324 xma.l f36 = f32, f6, f44
325 mov r28 = 0
326 xma.hu f40 = f32, f6, f44
328 ldf8 f32 = [up], 8
329 getf.sig r26 = f38
331 getf.sig r30 = f42
332 xma.l f37 = f33, f6, f45
333 ldf8 f44 = [rp], 8
334 xma.hu f41 = f33, f6, f45
336 ldf8 f33 = [up], 8
337 getf.sig r27 = f39
338 br .LL11
C Feed-in for n mod 4 == 0.  Limb pairs two to four are fetched and the first
C product started; the counted branch is taken when more than four limbs are
C present.
341 .Lb00: ldf8 f33 = [up], 8
342 ldf8 f45 = [rp], 8
344 ldf8 f34 = [up], 8
345 ldf8 f46 = [rp], 8
347 ldf8 f35 = [up], 8
348 xma.l f36 = f7, f6, f8
349 ldf8 f47 = [rp], 8
350 xma.hu f40 = f7, f6, f8
351 br.cloop.dptk .grt4
C n == 4: form the other three products, store the first low half as is and
C finish the three remaining sums from .Lcj4 on.
353 xma.l f37 = f33, f6, f45
354 xma.hu f41 = f33, f6, f45
355 xma.l f38 = f34, f6, f46
356 xma.hu f42 = f34, f6, f46
358 getf.sig r28 = f40
359 stf8 [r20] = f36, 8
360 xma.l f39 = f35, f6, f47
361 getf.sig r25 = f37
362 xma.hu f43 = f35, f6, f47
364 getf.sig r29 = f41
365 getf.sig r26 = f38
366 getf.sig r30 = f42
367 getf.sig r27 = f39
368 br .Lcj4
C n >= 8: preload four more limb pairs while products two to four start.
370 .grt4:
371 ldf8 f32 = [up], 8
372 xma.l f37 = f33, f6, f45
373 ldf8 f44 = [rp], 8
374 xma.hu f41 = f33, f6, f45
376 ldf8 f33 = [up], 8
377 xma.l f38 = f34, f6, f46
378 ldf8 f45 = [rp], 8
379 xma.hu f42 = f34, f6, f46
381 ldf8 f34 = [up], 8
C NOTE(review): r24 fetched here is consumed only on the .grt8 path (at
C .LL00); on the n == 8 path it is reloaded at .Lcj8 before any use.
C Presumably that is what the FIXME marks -- confirm.
382 getf.sig r24 = f36 C FIXME
383 xma.l f39 = f35, f6, f47
384 ldf8 f46 = [rp], 8
385 getf.sig r28 = f40
386 xma.hu f43 = f35, f6, f47
388 ldf8 f35 = [up], 8
389 getf.sig r25 = f37
390 ldf8 f47 = [rp], 8
391 br.cloop.dptk .grt8
C n == 8: store the first low half directly, form two more products and
C hand over to the wind-down at .Lcj8, which issues the last two multiplies.
393 getf.sig r29 = f41
394 stf8 [r20] = f36, 8 C FIXME
395 xma.l f36 = f32, f6, f44
396 getf.sig r26 = f38
397 getf.sig r30 = f42
398 xma.hu f40 = f32, f6, f44
400 xma.l f37 = f33, f6, f45
401 getf.sig r27 = f39
402 xma.hu f41 = f33, f6, f45
403 br .Lcj8
C n >= 12: r31 = 0 stands in for the high half that would precede the first
C low half (r24) when it enters the carry chain at .LL00 inside the loop.
405 .grt8:
406 getf.sig r29 = f41
407 xma.l f36 = f32, f6, f44
408 mov r31 = 0
409 xma.hu f40 = f32, f6, f44
411 ldf8 f32 = [up], 8
412 getf.sig r26 = f38
413 br .LL00
416 C *** MAIN LOOP START ***
C Four limbs per pass.  Each of the four stages adds the high half of one
C product (r28-r31) to the low half of the next (r24-r27) plus the incoming
C carry, stores the sum through r20 and derives the outgoing carry with an
C unsigned compare against the low half: leu when 1 was added, ltu when it
C was not.  Loads and multiplies for later limbs are interleaved with this.
C .LL00, .LL11 and .LL10 double as entry points for the feed-in code.
417 ALIGN(32) C insn fed cycle #
418 .Loop:
419 .pred.rel "mutex", p6, p7 C num by i1 i2
C Stage 1: r14 = r30 + r27 (+1 under p6); carry out goes to p8/p9.
420 getf.sig r29 = f41 C 00 16 0 0
421 xma.l f36 = f32, f6, f44 C 01 06,15 0 0
422 (p6) add r14 = r30, r27, 1 C 02 0 0
423 ldf8 f47 = [rp], 8 C 03 0 0
424 xma.hu f40 = f32, f6, f44 C 04 06,15 0 0
425 (p7) add r14 = r30, r27 C 05 0 0
427 .pred.rel "mutex", p6, p7
428 ldf8 f32 = [up], 8 C 06 1 1
429 (p6) cmp.leu p8, p9 = r14, r27 C 07 1 1
430 (p7) cmp.ltu p8, p9 = r14, r27 C 08 1 1
431 getf.sig r26 = f38 C 09 25 2 1
432 st8 [r20] = r14, 8 C 10 2 1
433 nop.b 0 C 11 2 1
435 .LL00:
436 .pred.rel "mutex", p8, p9
C Stage 2: r16 = r31 + r24 (+1 under p8); carry out goes to p6/p7.
437 getf.sig r30 = f42 C 12 28 3 2
438 xma.l f37 = f33, f6, f45 C 13 18,27 3 2
439 (p8) add r16 = r31, r24, 1 C 14 3 2
440 ldf8 f44 = [rp], 8 C 15 3 2
441 xma.hu f41 = f33, f6, f45 C 16 18,27 3 2
442 (p9) add r16 = r31, r24 C 17 3 2
444 .pred.rel "mutex", p8, p9
445 ldf8 f33 = [up], 8 C 18 4 3
446 (p8) cmp.leu p6, p7 = r16, r24 C 19 4 3
447 (p9) cmp.ltu p6, p7 = r16, r24 C 20 4 3
448 getf.sig r27 = f39 C 21 37 5 3
449 st8 [r20] = r16, 8 C 22 5 3
450 nop.b 0 C 23 5 3
452 .LL11:
453 .pred.rel "mutex", p6, p7
C Stage 3: r14 = r28 + r25 (+1 under p6); carry out goes to p8/p9.
454 getf.sig r31 = f43 C 24 40 6 4
455 xma.l f38 = f34, f6, f46 C 25 30,39 6 4
456 (p6) add r14 = r28, r25, 1 C 26 6 4
457 ldf8 f45 = [rp], 8 C 27 6 4
458 xma.hu f42 = f34, f6, f46 C 28 30,39 6 4
459 (p7) add r14 = r28, r25 C 29 6 4
461 .pred.rel "mutex", p6, p7
462 ldf8 f34 = [up], 8 C 30 7 5
463 (p6) cmp.leu p8, p9 = r14, r25 C 31 7 5
464 (p7) cmp.ltu p8, p9 = r14, r25 C 32 7 5
465 getf.sig r24 = f36 C 33 01 8 5
466 st8 [r20] = r14, 8 C 34 8 5
467 nop.b 0 C 35 8 5
469 .LL10:
470 .pred.rel "mutex", p8, p9
C Stage 4: r16 = r29 + r26 (+1 under p8); carry out goes to p6/p7, ready
C for stage 1 of the next pass or for the wind-down.
471 getf.sig r28 = f40 C 36 04 9 6
472 xma.l f39 = f35, f6, f47 C 37 42,03 9 6
473 (p8) add r16 = r29, r26, 1 C 38 9 6
474 ldf8 f46 = [rp], 8 C 39 9 6
475 xma.hu f43 = f35, f6, f47 C 40 42,03 9 6
476 (p9) add r16 = r29, r26 C 41 9 6
478 .pred.rel "mutex", p8, p9
479 ldf8 f35 = [up], 8 C 42 10 7
480 (p8) cmp.leu p6, p7 = r16, r26 C 43 10 7
481 (p9) cmp.ltu p6, p7 = r16, r26 C 44 10 7
482 getf.sig r25 = f37 C 45 13 11 7
483 st8 [r20] = r16, 8 C 46 11 7
484 br.cloop.dptk .Loop C 47 11 7
485 C *** MAIN LOOP END ***
C Wind-down.  Same add / compare / store stages as the main loop, but with
C no further up[] loads, and with less work left at each label further
C down.  The feed-in paths for short vectors join at the .Lcj labels.
487 .Le0:
488 .pred.rel "mutex", p6, p7
C One last rp[] limb is still fetched here, for the final multiply at .Lcj7.
489 getf.sig r29 = f41 C
490 xma.l f36 = f32, f6, f44 C
491 (p6) add r14 = r30, r27, 1 C
492 ldf8 f47 = [rp], 8 C
493 xma.hu f40 = f32, f6, f44 C
494 (p7) add r14 = r30, r27 C
496 .pred.rel "mutex", p6, p7
497 (p6) cmp.leu p8, p9 = r14, r27 C
498 (p7) cmp.ltu p8, p9 = r14, r27 C
499 getf.sig r26 = f38 C
500 st8 [r20] = r14, 8 C
502 .pred.rel "mutex", p8, p9
503 getf.sig r30 = f42 C
504 xma.l f37 = f33, f6, f45 C
505 (p8) add r16 = r31, r24, 1 C
506 xma.hu f41 = f33, f6, f45 C
507 (p9) add r16 = r31, r24 C
509 .pred.rel "mutex", p8, p9
510 (p8) cmp.leu p6, p7 = r16, r24 C
511 (p9) cmp.ltu p6, p7 = r16, r24 C
512 getf.sig r27 = f39 C
513 st8 [r20] = r16, 8 C
515 .Lcj8:
516 .pred.rel "mutex", p6, p7
517 getf.sig r31 = f43 C
518 xma.l f38 = f34, f6, f46 C
519 (p6) add r14 = r28, r25, 1 C
520 xma.hu f42 = f34, f6, f46 C
521 (p7) add r14 = r28, r25 C
523 .pred.rel "mutex", p6, p7
524 (p6) cmp.leu p8, p9 = r14, r25 C
525 (p7) cmp.ltu p8, p9 = r14, r25 C
526 getf.sig r24 = f36 C
527 st8 [r20] = r14, 8 C
529 .Lcj7:
530 .pred.rel "mutex", p8, p9
C Last multiply of the whole operation is issued in this stage.
531 getf.sig r28 = f40 C
532 xma.l f39 = f35, f6, f47 C
533 (p8) add r16 = r29, r26, 1 C
534 xma.hu f43 = f35, f6, f47 C
535 (p9) add r16 = r29, r26 C
537 .pred.rel "mutex", p8, p9
538 (p8) cmp.leu p6, p7 = r16, r26 C
539 (p9) cmp.ltu p6, p7 = r16, r26 C
540 getf.sig r25 = f37 C
541 st8 [r20] = r16, 8 C
C From here on only moves out of the FP registers, sums and stores remain.
543 .Lcj6:
544 .pred.rel "mutex", p6, p7
545 getf.sig r29 = f41 C
546 (p6) add r14 = r30, r27, 1 C
547 (p7) add r14 = r30, r27 C
549 .pred.rel "mutex", p6, p7
550 (p6) cmp.leu p8, p9 = r14, r27 C
551 (p7) cmp.ltu p8, p9 = r14, r27 C
552 getf.sig r26 = f38 C
553 st8 [r20] = r14, 8 C
555 .Lcj5:
556 .pred.rel "mutex", p8, p9
557 getf.sig r30 = f42 C
558 (p8) add r16 = r31, r24, 1 C
559 (p9) add r16 = r31, r24 C
561 .pred.rel "mutex", p8, p9
562 (p8) cmp.leu p6, p7 = r16, r24 C
563 (p9) cmp.ltu p6, p7 = r16, r24 C
564 getf.sig r27 = f39 C
565 st8 [r20] = r16, 8 C
567 .Lcj4:
568 .pred.rel "mutex", p6, p7
C r8 receives the high half of the last product; the final carry is added
C to it just before the return.
569 getf.sig r8 = f43 C
570 (p6) add r14 = r28, r25, 1 C
571 (p7) add r14 = r28, r25 C
573 .pred.rel "mutex", p6, p7
574 st8 [r20] = r14, 8 C
575 (p6) cmp.leu p8, p9 = r14, r25 C
576 (p7) cmp.ltu p8, p9 = r14, r25 C
578 .Lcj3:
579 .pred.rel "mutex", p8, p9
580 (p8) add r16 = r29, r26, 1 C
581 (p9) add r16 = r29, r26 C
583 .pred.rel "mutex", p8, p9
584 st8 [r20] = r16, 8 C
585 (p8) cmp.leu p6, p7 = r16, r26 C
586 (p9) cmp.ltu p6, p7 = r16, r26 C
588 .Lcj2:
589 .pred.rel "mutex", p6, p7
590 (p6) add r14 = r30, r27, 1 C
591 (p7) add r14 = r30, r27 C
593 .pred.rel "mutex", p6, p7
C Final store; r20 is not advanced any more.
594 st8 [r20] = r14 C
595 (p6) cmp.leu p8, p9 = r14, r27 C
596 (p7) cmp.ltu p8, p9 = r14, r27 C
C Fold the last carry into r8, put back the saved ar.lc and return.
598 (p8) add r8 = 1, r8 C M I
599 mov.i ar.lc = r2 C I0
600 br.ret.sptk.many b0 C B
601 EPILOGUE()
602 ASM_END()