beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / ia64 / mul_1.asm
blob21bf6d0e14f3e99a413ead7ba40c80278f28da13
1 dnl IA-64 mpn_mul_1, mpn_mul_1c -- Multiply a limb vector with a limb and
2 dnl store the result in a second limb vector.
4 dnl Contributed to the GNU project by Torbjorn Granlund.
6 dnl Copyright 2000-2004, 2006, 2007 Free Software Foundation, Inc.
8 dnl This file is part of the GNU MP Library.
9 dnl
10 dnl The GNU MP Library is free software; you can redistribute it and/or modify
11 dnl it under the terms of either:
12 dnl
13 dnl * the GNU Lesser General Public License as published by the Free
14 dnl Software Foundation; either version 3 of the License, or (at your
15 dnl option) any later version.
16 dnl
17 dnl or
18 dnl
19 dnl * the GNU General Public License as published by the Free Software
20 dnl Foundation; either version 2 of the License, or (at your option) any
21 dnl later version.
22 dnl
23 dnl or both in parallel, as here.
24 dnl
25 dnl The GNU MP Library is distributed in the hope that it will be useful, but
26 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
27 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
28 dnl for more details.
29 dnl
30 dnl You should have received copies of the GNU General Public License and the
31 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
32 dnl see https://www.gnu.org/licenses/.
34 include(`../config.m4')
36 C cycles/limb
37 C Itanium: 4.0
38 C Itanium 2: 2.0
40 C TODO
41 C * Further optimize feed-in and wind-down code, both for speed and code size.
42 C * Handle low limb input and results specially, using a common stf8 in the
43 C epilogue.
44 C * Use 1 c/l carry propagation scheme in wind-down code.
45 C * Use extra pointer register for `up' to speed up feed-in loads.
46 C * Work out final differences with addmul_1.asm.
48 C INPUT PARAMETERS
C Names for the argument registers used by both entry points below.
49 define(`rp', `r32') C destination limb vector, written with stf8/st8
50 define(`up', `r33') C source limb vector, read with ldf8
51 define(`n', `r34') C limb count
52 define(`vl', `r35') C multiplier limb, moved to f6
53 define(`cy', `r36') C carry-in limb, for mpn_mul_1c
55 ASM_START()
C mpn_mul_1: multiply the limbs read from up by the single limb vl, store the
C low product limbs at rp, and leave the final high limb in r8 at br.ret.
C This entry copies f0 into f9, the addend of the first product; mpn_mul_1c
C further down loads f9 from cy instead.  Execution continues into .Lcommon.
C Values set up here:
C   r15 = n - 1
C   r2  = saved ar.lc (restored before every br.ret)
C   f7  = first source limb (up is post-incremented by 8)
C   r14 = n mod 4
C When HAVE_ABI_32 is defined, rp and up first go through addp4 and n
C through zxt4.
56 PROLOGUE(mpn_mul_1)
57 .prologue
58 .save ar.lc, r2
59 .body
61 ifdef(`HAVE_ABI_32',
62 ` addp4 rp = 0, rp C M I
63 addp4 up = 0, up C M I
64 zxt4 n = n C I
67 {.mfi
68 adds r15 = -1, n C M I
69 mov f9 = f0 C F
70 mov.i r2 = ar.lc C I0
72 {.mmi
73 ldf8 f7 = [up], 8 C M
74 nop.m 0 C M
75 and r14 = 3, n C M I
C Common continuation of mpn_mul_1 (fall-through) and mpn_mul_1c (branch).
C   f6    = vl, the multiplier used by every xma below
C   ar.lc = r31 = (n - 1) >> 2
C   p10 / p11 / p12 are set when n mod 4 is 0 / 2 / 3 and select
C   .Lb00 / .Lb10 / .Lb11; n mod 4 == 1 falls through into .Lb01.
C The two cmp.ne r0, r0 clear p6 and p8 and set p7 and p9, so the predicated
C add/compare chain starts in its no-carry state.
78 .Lcommon:
79 {.mii
80 setf.sig f6 = vl C M2 M3
81 shr.u r31 = r15, 2 C I0
82 cmp.eq p10, p0 = 0, r14 C M I
84 {.mii
85 cmp.eq p11, p0 = 2, r14 C M I
86 cmp.eq p12, p0 = 3, r14 C M I
87 nop.i 0 C I
90 {.mii
91 cmp.ne p6, p7 = r0, r0 C M I
92 mov.i ar.lc = r31 C I0
93 cmp.ne p8, p9 = r0, r0 C M I
95 {.bbb
96 (p10) br.dptk .Lb00 C B
97 (p11) br.dptk .Lb10 C B
98 (p12) br.dptk .Lb11 C B
C Feed-in for n mod 4 == 1.
C r20 = 0 is the addend paired with r17 (low half of the first product) in
C the first add at .LL01, which is reached through .grt5.
102 .Lb01: mov r20 = 0
103 br.cloop.dptk .grt1 C B
C Branch not taken: ar.lc was 0, so n == 1.  Form the single product
C f7 * f6 + f9, store its low half at rp and return its high half in r8.
105 xma.l f39 = f7, f6, f9 C F
106 xma.hu f43 = f7, f6, f9 C F
108 getf.sig r8 = f43 C M2
109 stf8 [rp] = f39 C M2 M3
110 mov.i ar.lc = r2 C I0
111 br.ret.sptk.many b0 C B
C n >= 5: load four more source limbs into f32..f35 and form the first
C product (the only one that uses the addend f9).
113 .grt1:
114 ldf8 f32 = [up], 8
116 ldf8 f33 = [up], 8
118 ldf8 f34 = [up], 8
119 xma.l f39 = f7, f6, f9
120 xma.hu f43 = f7, f6, f9
122 ldf8 f35 = [up], 8
123 br.cloop.dptk .grt5
C Branch not taken: n == 5.  Form the remaining four products with no more
C loads, store the first low half directly, and enter the wind-down code at
C .Lcj5.
125 xma.l f36 = f32, f6, f0
126 xma.hu f40 = f32, f6, f0
128 stf8 [rp] = f39, 8
129 xma.l f37 = f33, f6, f0
130 xma.hu f41 = f33, f6, f0
132 getf.sig r21 = f43
133 getf.sig r18 = f36
134 xma.l f38 = f34, f6, f0
135 xma.hu f42 = f34, f6, f0
137 getf.sig r22 = f40
138 getf.sig r19 = f37
139 xma.l f39 = f35, f6, f0
140 xma.hu f43 = f35, f6, f0
142 getf.sig r23 = f41
143 getf.sig r16 = f38
144 br .Lcj5
C n >= 9: form products of the limbs already loaded, keep loading further
C limbs, and enter the main loop at .LL01.
146 .grt5:
147 xma.l f36 = f32, f6, f0
148 xma.hu f40 = f32, f6, f0
150 getf.sig r17 = f39
151 ldf8 f32 = [up], 8
152 xma.l f37 = f33, f6, f0
153 xma.hu f41 = f33, f6, f0
155 getf.sig r21 = f43
156 ldf8 f33 = [up], 8
157 xma.l f38 = f34, f6, f0
159 getf.sig r18 = f36
160 xma.hu f42 = f34, f6, f0
162 getf.sig r22 = f40
163 ldf8 f34 = [up], 8
164 xma.l f39 = f35, f6, f0
166 getf.sig r19 = f37
167 xma.hu f43 = f35, f6, f0
168 br .LL01
C Feed-in for n mod 4 == 2.  The second source limb is loaded into f35.
C r23 = 0 is the addend paired with r16 (low half of the first product) in
C the first add at .LL10, which is reached through .grt6.
171 .Lb10: ldf8 f35 = [up], 8
172 mov r23 = 0
173 br.cloop.dptk .grt2
C Branch not taken: ar.lc was 0, so n == 2.  The high half of the first
C product (f42) is fed in as the addend of the second product, so no
C integer carry handling is needed.  The last high half is returned in r8.
175 xma.l f38 = f7, f6, f9
176 xma.hu f42 = f7, f6, f9
178 stf8 [rp] = f38, 8
179 xma.l f39 = f35, f6, f42
180 xma.hu f43 = f35, f6, f42
182 getf.sig r8 = f43
183 stf8 [rp] = f39
184 mov.i ar.lc = r2
185 br.ret.sptk.many b0
C n >= 6: load four more source limbs and form the first two products.
188 .grt2:
189 ldf8 f32 = [up], 8
191 ldf8 f33 = [up], 8
192 xma.l f38 = f7, f6, f9
193 xma.hu f42 = f7, f6, f9
195 ldf8 f34 = [up], 8
196 xma.l f39 = f35, f6, f0
197 xma.hu f43 = f35, f6, f0
199 ldf8 f35 = [up], 8
200 br.cloop.dptk .grt6
C Branch not taken: n == 6.  Store the first low half directly, form the
C remaining products with no more loads, and enter the wind-down code at
C .Lcj6.
202 stf8 [rp] = f38, 8
203 xma.l f36 = f32, f6, f0
204 xma.hu f40 = f32, f6, f0
206 getf.sig r20 = f42
207 getf.sig r17 = f39
208 xma.l f37 = f33, f6, f0
209 xma.hu f41 = f33, f6, f0
211 getf.sig r21 = f43
212 getf.sig r18 = f36
213 xma.l f38 = f34, f6, f0
214 xma.hu f42 = f34, f6, f0
216 getf.sig r22 = f40
217 getf.sig r19 = f37
218 xma.l f39 = f35, f6, f0
219 xma.hu f43 = f35, f6, f0
220 br .Lcj6
C n >= 10: keep loading and multiplying, then enter the main loop at .LL10.
222 .grt6:
223 getf.sig r16 = f38
224 xma.l f36 = f32, f6, f0
225 xma.hu f40 = f32, f6, f0
227 getf.sig r20 = f42
228 ldf8 f32 = [up], 8
229 xma.l f37 = f33, f6, f0
231 getf.sig r17 = f39
232 xma.hu f41 = f33, f6, f0
234 getf.sig r21 = f43
235 ldf8 f33 = [up], 8
236 xma.l f38 = f34, f6, f0
238 getf.sig r18 = f36
239 xma.hu f42 = f34, f6, f0
240 br .LL10
C Feed-in for n mod 4 == 3.  Source limbs two and three go to f34 and f35.
C r22 = 0 is the addend paired with r19 (low half of the first product) in
C the first add at .LL11, which is reached through .grt7.
243 .Lb11: ldf8 f34 = [up], 8
244 mov r22 = 0
246 ldf8 f35 = [up], 8
247 br.cloop.dptk .grt3
C Branch not taken: ar.lc was 0, so n == 3.  Form all three products, store
C the first low half directly, put the last high half in r8, and let the
C wind-down code at .Lcj3 do the two remaining adds and the final carry.
250 xma.l f37 = f7, f6, f9
251 xma.hu f41 = f7, f6, f9
252 xma.l f38 = f34, f6, f0
253 xma.hu f42 = f34, f6, f0
254 xma.l f39 = f35, f6, f0
255 xma.hu f43 = f35, f6, f0
257 getf.sig r23 = f41
258 stf8 [rp] = f37, 8
259 getf.sig r16 = f38
260 getf.sig r20 = f42
261 getf.sig r17 = f39
262 getf.sig r8 = f43
263 br .Lcj3
C n >= 7: load four more source limbs while forming the first three
C products.
265 .grt3:
266 ldf8 f32 = [up], 8
267 xma.l f37 = f7, f6, f9
268 xma.hu f41 = f7, f6, f9
270 ldf8 f33 = [up], 8
271 xma.l f38 = f34, f6, f0
272 xma.hu f42 = f34, f6, f0
274 getf.sig r19 = f37
275 ldf8 f34 = [up], 8
276 xma.l f39 = f35, f6, f0
277 xma.hu f43 = f35, f6, f0
279 getf.sig r23 = f41
280 ldf8 f35 = [up], 8
281 br.cloop.dptk .grt7
C Branch not taken: n == 7.  Store the first low half (r19) directly and
C enter the wind-down code at .Lcj7.
283 getf.sig r16 = f38
284 xma.l f36 = f32, f6, f0
285 getf.sig r20 = f42
286 xma.hu f40 = f32, f6, f0
288 getf.sig r17 = f39
289 xma.l f37 = f33, f6, f0
290 getf.sig r21 = f43
291 xma.hu f41 = f33, f6, f0
293 getf.sig r18 = f36
294 st8 [rp] = r19, 8
295 xma.l f38 = f34, f6, f0
296 xma.hu f42 = f34, f6, f0
297 br .Lcj7
C n >= 11: keep loading and multiplying, then enter the main loop at .LL11.
299 .grt7:
300 getf.sig r16 = f38
301 xma.l f36 = f32, f6, f0
302 xma.hu f40 = f32, f6, f0
304 getf.sig r20 = f42
305 ldf8 f32 = [up], 8
306 xma.l f37 = f33, f6, f0
308 getf.sig r17 = f39
309 xma.hu f41 = f33, f6, f0
310 br .LL11
C Feed-in for n mod 4 == 0.  Source limbs two to four go to f33..f35 and
C the first product (with addend f9) is formed in f36/f40.
C r21 = 0 is the addend paired with r18 (low half of the first product) in
C the first add at .LL00, which is reached through .grt8.
313 .Lb00: ldf8 f33 = [up], 8
314 mov r21 = 0
316 ldf8 f34 = [up], 8
318 ldf8 f35 = [up], 8
319 xma.l f36 = f7, f6, f9
320 xma.hu f40 = f7, f6, f9
321 br.cloop.dptk .grt4
C Branch not taken: ar.lc was 0, so n == 4.  Form the remaining products,
C store the first low half directly, and enter the wind-down code at .Lcj4.
323 xma.l f37 = f33, f6, f0
324 xma.hu f41 = f33, f6, f0
325 xma.l f38 = f34, f6, f0
326 xma.hu f42 = f34, f6, f0
328 getf.sig r22 = f40
329 stf8 [rp] = f36, 8
330 xma.l f39 = f35, f6, f0
331 getf.sig r19 = f37
332 xma.hu f43 = f35, f6, f0
334 getf.sig r23 = f41
335 getf.sig r16 = f38
336 getf.sig r20 = f42
337 getf.sig r17 = f39
338 br .Lcj4
C n >= 8: load four more source limbs while forming products two to four.
340 .grt4:
341 ldf8 f32 = [up], 8
342 xma.l f37 = f33, f6, f0
343 xma.hu f41 = f33, f6, f0
345 getf.sig r18 = f36
346 ldf8 f33 = [up], 8
347 xma.l f38 = f34, f6, f0
348 xma.hu f42 = f34, f6, f0
350 getf.sig r22 = f40
351 ldf8 f34 = [up], 8
352 xma.l f39 = f35, f6, f0
354 getf.sig r19 = f37
355 getf.sig r23 = f41
356 xma.hu f43 = f35, f6, f0
357 ldf8 f35 = [up], 8
358 br.cloop.dptk .grt8
C Branch not taken: n == 8.  Store the first low half (r18) directly and
C enter the wind-down code at .Lcj8.
360 getf.sig r16 = f38
361 xma.l f36 = f32, f6, f0
362 getf.sig r20 = f42
363 xma.hu f40 = f32, f6, f0
365 getf.sig r17 = f39
366 st8 [rp] = r18, 8
367 xma.l f37 = f33, f6, f0
368 xma.hu f41 = f33, f6, f0
369 br .Lcj8
C n >= 12: start the next product and enter the main loop at .LL00.
371 .grt8:
372 getf.sig r16 = f38
373 xma.l f36 = f32, f6, f0
374 xma.hu f40 = f32, f6, f0
375 br .LL00
378 C *** MAIN LOOP START ***
C One pass produces four result limbs in four similar stages.  Each stage
C   - moves product halves from FP to integer registers (getf.sig),
C   - starts the low (xma.l) and high (xma.hu) product of a later limb,
C   - forms r24 = low half + high half of the preceding product, adding 1
C     under the carry predicate (p6 or p8) and nothing extra under the
C     complementary predicate (p7 or p9),
C   - stores r24 through rp and loads a further source limb through up,
C   - derives the next predicate pair from r24 and the low half: cmp.leu
C     after an add that included the carry, cmp.ltu after one that did not.
C Register pairs added: r18+r21, r19+r22, r16+r23, r17+r20.
C .LL00, .LL11, .LL10 and .LL01 are the entry points used by the feed-in
C code; br.cloop at the bottom repeats while ar.lc is nonzero.
379 ALIGN(32)
380 .Loop:
381 .pred.rel "mutex",p6,p7
382 getf.sig r16 = f38
383 xma.l f36 = f32, f6, f0
384 (p6) cmp.leu p8, p9 = r24, r17
385 st8 [rp] = r24, 8
386 xma.hu f40 = f32, f6, f0
387 (p7) cmp.ltu p8, p9 = r24, r17
389 .LL00:
390 .pred.rel "mutex",p8,p9
391 getf.sig r20 = f42
392 (p8) add r24 = r18, r21, 1
393 nop.b 0
394 ldf8 f32 = [up], 8
395 (p9) add r24 = r18, r21
396 nop.b 0
398 .pred.rel "mutex",p8,p9
399 getf.sig r17 = f39
400 xma.l f37 = f33, f6, f0
401 (p8) cmp.leu p6, p7 = r24, r18
402 st8 [rp] = r24, 8
403 xma.hu f41 = f33, f6, f0
404 (p9) cmp.ltu p6, p7 = r24, r18
406 .LL11:
407 .pred.rel "mutex",p6,p7
408 getf.sig r21 = f43
409 (p6) add r24 = r19, r22, 1
410 nop.b 0
411 ldf8 f33 = [up], 8
412 (p7) add r24 = r19, r22
413 nop.b 0
415 .pred.rel "mutex",p6,p7
416 getf.sig r18 = f36
417 xma.l f38 = f34, f6, f0
418 (p6) cmp.leu p8, p9 = r24, r19
419 st8 [rp] = r24, 8
420 xma.hu f42 = f34, f6, f0
421 (p7) cmp.ltu p8, p9 = r24, r19
423 .LL10:
424 .pred.rel "mutex",p8,p9
425 getf.sig r22 = f40
426 (p8) add r24 = r16, r23, 1
427 nop.b 0
428 ldf8 f34 = [up], 8
429 (p9) add r24 = r16, r23
430 nop.b 0
432 .pred.rel "mutex",p8,p9
433 getf.sig r19 = f37
434 xma.l f39 = f35, f6, f0
435 (p8) cmp.leu p6, p7 = r24, r16
436 st8 [rp] = r24, 8
437 xma.hu f43 = f35, f6, f0
438 (p9) cmp.ltu p6, p7 = r24, r16
440 .LL01:
441 .pred.rel "mutex",p6,p7
442 getf.sig r23 = f41
443 (p6) add r24 = r17, r20, 1
444 nop.b 0
445 ldf8 f35 = [up], 8
446 (p7) add r24 = r17, r20
447 br.cloop.dptk .Loop
448 C *** MAIN LOOP END ***
C Wind-down code: the same add / compare / store chain as the main loop,
C but with no further loads from up.  Falling out of the main loop enters
C at .Lcj9; the feed-in code branches to .Lcj8, .Lcj7, .Lcj6, .Lcj5, .Lcj4
C and .Lcj3 for n = 8 down to n = 3.
C The products of the limbs still held in f32..f35 are formed in the first
C stages; from .Lcj6 on only adds, compares and stores remain.
451 .Lcj9:
452 .pred.rel "mutex",p6,p7
453 getf.sig r16 = f38
454 xma.l f36 = f32, f6, f0
455 (p6) cmp.leu p8, p9 = r24, r17
456 st8 [rp] = r24, 8
457 xma.hu f40 = f32, f6, f0
458 (p7) cmp.ltu p8, p9 = r24, r17
460 .pred.rel "mutex",p8,p9
461 getf.sig r20 = f42
462 (p8) add r24 = r18, r21, 1
463 (p9) add r24 = r18, r21
465 .pred.rel "mutex",p8,p9
466 getf.sig r17 = f39
467 xma.l f37 = f33, f6, f0
468 (p8) cmp.leu p6, p7 = r24, r18
469 st8 [rp] = r24, 8
470 xma.hu f41 = f33, f6, f0
471 (p9) cmp.ltu p6, p7 = r24, r18
473 .Lcj8:
474 .pred.rel "mutex",p6,p7
475 getf.sig r21 = f43
476 (p6) add r24 = r19, r22, 1
477 (p7) add r24 = r19, r22
479 .pred.rel "mutex",p6,p7
480 getf.sig r18 = f36
481 xma.l f38 = f34, f6, f0
482 (p6) cmp.leu p8, p9 = r24, r19
483 st8 [rp] = r24, 8
484 xma.hu f42 = f34, f6, f0
485 (p7) cmp.ltu p8, p9 = r24, r19
487 .Lcj7:
488 .pred.rel "mutex",p8,p9
489 getf.sig r22 = f40
490 (p8) add r24 = r16, r23, 1
491 (p9) add r24 = r16, r23
493 .pred.rel "mutex",p8,p9
494 getf.sig r19 = f37
495 xma.l f39 = f35, f6, f0
496 (p8) cmp.leu p6, p7 = r24, r16
497 st8 [rp] = r24, 8
498 xma.hu f43 = f35, f6, f0
499 (p9) cmp.ltu p6, p7 = r24, r16
501 .Lcj6:
502 .pred.rel "mutex",p6,p7
503 getf.sig r23 = f41
504 (p6) add r24 = r17, r20, 1
505 (p7) add r24 = r17, r20
507 .pred.rel "mutex",p6,p7
508 (p6) cmp.leu p8, p9 = r24, r17
509 (p7) cmp.ltu p8, p9 = r24, r17
510 getf.sig r16 = f38
511 st8 [rp] = r24, 8
513 .Lcj5:
514 .pred.rel "mutex",p8,p9
515 getf.sig r20 = f42
516 (p8) add r24 = r18, r21, 1
517 (p9) add r24 = r18, r21
519 .pred.rel "mutex",p8,p9
520 (p8) cmp.leu p6, p7 = r24, r18
521 (p9) cmp.ltu p6, p7 = r24, r18
522 getf.sig r17 = f39
523 st8 [rp] = r24, 8
C r8 receives the high half of the last product (f43); it becomes the
C returned limb once the final carry has been added below.
525 .Lcj4:
526 .pred.rel "mutex",p6,p7
527 getf.sig r8 = f43
528 (p6) add r24 = r19, r22, 1
529 (p7) add r24 = r19, r22
531 .pred.rel "mutex",p6,p7
532 st8 [rp] = r24, 8
533 (p6) cmp.leu p8, p9 = r24, r19
534 (p7) cmp.ltu p8, p9 = r24, r19
C The n == 3 feed-in enters here with r8 already loaded.
536 .Lcj3:
537 .pred.rel "mutex",p8,p9
538 (p8) add r24 = r16, r23, 1
539 (p9) add r24 = r16, r23
541 .pred.rel "mutex",p8,p9
542 st8 [rp] = r24, 8
543 (p8) cmp.leu p6, p7 = r24, r16
544 (p9) cmp.ltu p6, p7 = r24, r16
546 .Lcj2:
547 .pred.rel "mutex",p6,p7
548 (p6) add r24 = r17, r20, 1
549 (p7) add r24 = r17, r20
551 .pred.rel "mutex",p6,p7
552 st8 [rp] = r24, 8
553 (p6) cmp.leu p8, p9 = r24, r17
554 (p7) cmp.ltu p8, p9 = r24, r17
C Add the carry out of the last limb add into r8, restore ar.lc from r2,
C and return.
556 (p8) add r8 = 1, r8
557 mov.i ar.lc = r2
558 br.ret.sptk.many b0
559 EPILOGUE()
C mpn_mul_1c: same operation as mpn_mul_1, but the addend of the first
C product (f9) is loaded from the extra argument cy instead of f0.
C The setup mirrors the mpn_mul_1 entry (r15 = n - 1, r2 = saved ar.lc,
C f7 = first source limb, r14 = n mod 4) and then branches to .Lcommon
C inside mpn_mul_1, which does all remaining work and returns.
561 PROLOGUE(mpn_mul_1c)
562 .prologue
563 .save ar.lc, r2
564 .body
566 ifdef(`HAVE_ABI_32',
567 ` addp4 rp = 0, rp C M I
568 addp4 up = 0, up C M I
569 zxt4 n = n C I
572 {.mmi
573 adds r15 = -1, n C M I
574 setf.sig f9 = cy C M2 M3
575 mov.i r2 = ar.lc C I0
577 {.mmb
578 ldf8 f7 = [up], 8 C M
579 and r14 = 3, n C M I
580 br.sptk .Lcommon
583 EPILOGUE()
584 ASM_END()