beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / powerpc64 / mode64 / p6 / mul_basecase.asm
blob3d32b46c353b50fd6acd33ba19a8a37babb991da
1 dnl PowerPC-64 mpn_mul_basecase.
3 dnl Copyright 1999-2001, 2003-2006, 2008, 2010 Free Software Foundation, Inc.
5 dnl This file is part of the GNU MP Library.
6 dnl
7 dnl The GNU MP Library is free software; you can redistribute it and/or modify
8 dnl it under the terms of either:
9 dnl
10 dnl * the GNU Lesser General Public License as published by the Free
11 dnl Software Foundation; either version 3 of the License, or (at your
12 dnl option) any later version.
13 dnl
14 dnl or
15 dnl
16 dnl * the GNU General Public License as published by the Free Software
17 dnl Foundation; either version 2 of the License, or (at your option) any
18 dnl later version.
19 dnl
20 dnl or both in parallel, as here.
21 dnl
22 dnl The GNU MP Library is distributed in the hope that it will be useful, but
23 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
25 dnl for more details.
26 dnl
27 dnl You should have received copies of the GNU General Public License and the
28 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
29 dnl see https://www.gnu.org/licenses/.
31 include(`../config.m4')
33 C cycles/limb
34 C POWER3/PPC630 ?
35 C POWER4/PPC970 ?
36 C POWER5 ?
37 C POWER6 12.25
39 C TODO
40 C * Reduce register usage. At least 4 fewer registers could be used.
41 C * Unroll more. 8-way unrolling would bring us to 10 c/l, 16-way unrolling
42 C would bring us to 9 c/l.
43 C * The bdz insns for b1 and b2 will never branch, so they could be removed.
44 C * Align things better, perhaps by moving things like pointer updates from
45 C before to after loops.
47 C INPUT PARAMETERS
48 define(`rp', `r3')
49 define(`up', `r4')
50 define(`un', `r5')
51 define(`vp', `r6')
52 define(`vn', `r7')
54 define(`v0', `r25')
55 define(`outer_rp', `r22')
56 define(`outer_up', `r23')
58 ASM_START()
59 PROLOGUE(mpn_mul_basecase)
C mpn_mul_basecase(rp, up, un, vp, vn): write the full un+vn limb
C product of {up,un} and {vp,vn} to rp.  Assumes un >= vn >= 1
C (GMP mpn convention -- TODO confirm against caller contract).
61 C Special code for un <= 2, for efficiency of these important cases,
62 C and since it simplifies the default code.
63 cmpdi cr0, un, 2
64 bgt cr0, L(un_gt2)
C Here un <= 2.  cr0 "eq" <=> un == 2; cr6 records whether vn == 1.
65 cmpdi cr6, vn, 1
66 ld r7, 0(vp)
67 ld r5, 0(up)
68 mulld r8, r5, r7 C weight 0
69 mulhdu r9, r5, r7 C weight 1
70 std r8, 0(rp)
71 beq cr0, L(2x)
C un == 1 (and hence vn == 1): result is just the two limbs r8:r9.
72 std r9, 8(rp)
73 blr
74 ALIGN(16)
C un == 2: fold in up[1] * vp[0] at weights 1 and 2.
75 L(2x): ld r0, 8(up)
76 mulld r8, r0, r7 C weight 1
77 mulhdu r10, r0, r7 C weight 2
78 addc r9, r9, r8
79 addze r10, r10
80 bne cr6, L(2x2)
C un == 2, vn == 1: store the remaining two result limbs.
81 std r9, 8(rp)
82 std r10, 16(rp)
83 blr
84 ALIGN(16)
C un == 2, vn == 2: accumulate the two cross products with vp[1].
85 L(2x2): ld r6, 8(vp)
86 nop
87 mulld r8, r5, r6 C weight 1
88 mulhdu r11, r5, r6 C weight 2
89 mulld r12, r0, r6 C weight 2
90 mulhdu r0, r0, r6 C weight 3
91 addc r9, r9, r8
92 std r9, 8(rp)
93 adde r11, r11, r10
94 addze r0, r0
95 addc r11, r11, r12
96 addze r0, r0
97 std r11, 16(rp)
98 std r0, 24(rp)
99 blr
101 L(un_gt2):
C General case, un > 2.  Save callee-saved r20-r31 at negative
C offsets from r1 -- no stack frame is allocated (relies on the
C ABI-protected area below the stack pointer).
102 std r31, -8(r1)
103 std r30, -16(r1)
104 std r29, -24(r1)
105 std r28, -32(r1)
106 std r27, -40(r1)
107 std r26, -48(r1)
108 std r25, -56(r1)
109 std r24, -64(r1)
110 std r23, -72(r1)
111 std r22, -80(r1)
112 std r21, -88(r1)
113 std r20, -96(r1)
C Keep the original rp/up so each outer (per-v-limb) pass can
C recompute its pointers.
115 mr outer_rp, rp
116 mr outer_up, up
118 ld v0, 0(vp) C new v limb
119 addi vp, vp, 8
120 ld r26, 0(up)
C Dispatch on un mod 4; the inner loops below are 4-way unrolled,
C so each residue gets its own entry that peels 0-3 limbs first.
122 rldicl. r0, un, 0,62 C r0 = n & 3, set cr0
123 cmpdi cr6, r0, 2
124 addi un, un, 4 C compute count...
125 srdi un, un, 2 C ...for ctr
126 mtctr un C copy inner loop count into ctr
127 beq cr0, L(b0)
128 blt cr6, L(b1)
129 beq cr6, L(b2)
C fall through to L(b3) when un mod 4 == 3
132 ALIGN(16)
C un mod 4 == 3: peel 3 limbs of the first (multiply-only) pass,
C then fall into the 4-way unrolled loop.
133 L(b3):
134 ld r27, 8(up)
135 ld r20, 16(up)
136 mulld r0, r26, v0
137 mulhdu r31, r26, v0
138 mulld r24, r27, v0
139 mulhdu r8, r27, v0
140 mulld r9, r20, v0
141 mulhdu r10, r20, v0
142 addc r24, r24, r31
143 adde r9, r9, r8
144 addze r12, r10 C r12 = carry limb into the loop below
145 std r0, 0(rp)
146 std r24, 8(rp)
147 std r9, 16(rp)
148 addi up, up, 16
149 addi rp, rp, 16
150 bdz L(end_m_3)
152 ALIGN(32)
C First-pass inner loop: {rp,..} = {up,..} * v0, 4 limbs/iteration.
C r12 threads the high limb from one iteration into the next via
C the adde chain.
153 L(lo_m_3):
154 ld r26, 8(up)
155 ld r27, 16(up)
156 ld r20, 24(up)
157 ld r21, 32(up)
158 mulld r0, r26, v0
159 mulhdu r31, r26, v0
160 mulld r24, r27, v0
161 mulhdu r8, r27, v0
162 mulld r9, r20, v0
163 mulhdu r27, r20, v0
164 mulld r11, r21, v0
165 mulhdu r26, r21, v0
166 adde r0, r0, r12
167 adde r24, r24, r31
168 std r0, 8(rp)
169 adde r9, r9, r8
170 std r24, 16(rp)
171 adde r11, r11, r27
172 std r9, 24(rp)
173 addi up, up, 32
174 std r11, 32(rp)
175 addi rp, rp, 32
176 mr r12, r26
177 bdnz L(lo_m_3)
179 ALIGN(16)
C Store the top carry limb; done if that was the only v limb.
180 L(end_m_3):
181 addze r12, r12
182 addic. vn, vn, -1
183 std r12, 8(rp)
184 beq L(ret)
186 ALIGN(16)
C Outer loop: multiply {up,un} by the next v limb and add the
C result into the partial product already in rp.
187 L(outer_lo_3):
188 mtctr un C copy inner loop count into ctr
189 addi rp, outer_rp, 24
190 addi up, outer_up, 16
191 addi outer_rp, outer_rp, 8
192 ld v0, 0(vp) C new v limb
193 addi vp, vp, 8
194 ld r26, -16(up)
195 ld r27, -8(up)
196 ld r20, 0(up)
197 mulld r0, r26, v0
198 mulhdu r31, r26, v0
199 mulld r24, r27, v0
200 mulhdu r8, r27, v0
201 mulld r9, r20, v0
202 mulhdu r10, r20, v0
203 ld r28, -16(rp)
204 ld r29, -8(rp)
205 ld r30, 0(rp)
206 addc r24, r24, r31
207 adde r9, r9, r8
208 addze r12, r10
209 addc r0, r0, r28
210 std r0, -16(rp)
211 adde r24, r24, r29
212 std r24, -8(rp)
213 adde r9, r9, r30
214 std r9, 0(rp)
215 bdz L(end_3)
217 ALIGN(32) C registers dying
C Addmul inner loop: rp[] += {up,..} * v0, 4 limbs per iteration.
218 L(lo_3):
219 ld r26, 8(up)
220 ld r27, 16(up)
221 ld r20, 24(up) C
222 ld r21, 32(up) C
223 addi up, up, 32 C
224 addi rp, rp, 32 C
225 mulld r0, r26, v0 C
226 mulhdu r10, r26, v0 C 26
227 mulld r24, r27, v0 C
228 mulhdu r8, r27, v0 C 27
229 mulld r9, r20, v0 C
230 mulhdu r27, r20, v0 C 26
231 mulld r11, r21, v0 C
232 mulhdu r26, r21, v0 C 27
233 ld r28, -24(rp) C
234 adde r0, r0, r12 C 0 12
235 ld r29, -16(rp) C
236 adde r24, r24, r10 C 24 10
237 ld r30, -8(rp) C
238 ld r31, 0(rp) C
239 adde r9, r9, r8 C 8 9
240 adde r11, r11, r27 C 27 11
241 addze r12, r26 C 26
242 addc r0, r0, r28 C 0 28
243 std r0, -24(rp) C 0
244 adde r24, r24, r29 C 7 29
245 std r24, -16(rp) C 7
246 adde r9, r9, r30 C 9 30
247 std r9, -8(rp) C 9
248 adde r11, r11, r31 C 11 31
249 std r11, 0(rp) C 11
250 bdnz L(lo_3) C
252 ALIGN(16)
C Propagate the final carry, then loop while v limbs remain.
253 L(end_3):
254 addze r12, r12
255 addic. vn, vn, -1
256 std r12, 8(rp)
257 bne L(outer_lo_3)
258 b L(ret)
261 ALIGN(16)
C un mod 4 == 1: peel one limb of the first (multiply-only) pass.
262 L(b1):
263 mulld r0, r26, v0
264 mulhdu r12, r26, v0
265 addic r0, r0, 0 C clear CA for the adde chain in the loop
266 std r0, 0(rp)
267 bdz L(end_m_1)
269 ALIGN(16)
C First-pass inner loop: {rp,..} = {up,..} * v0, 4 limbs/iteration;
C r12 carries the high limb between iterations.
270 L(lo_m_1):
271 ld r26, 8(up)
272 ld r27, 16(up)
273 ld r20, 24(up)
274 ld r21, 32(up)
275 mulld r0, r26, v0
276 mulhdu r31, r26, v0
277 mulld r24, r27, v0
278 mulhdu r8, r27, v0
279 mulld r9, r20, v0
280 mulhdu r27, r20, v0
281 mulld r11, r21, v0
282 mulhdu r26, r21, v0
283 adde r0, r0, r12
284 adde r24, r24, r31
285 std r0, 8(rp)
286 adde r9, r9, r8
287 std r24, 16(rp)
288 adde r11, r11, r27
289 std r9, 24(rp)
290 addi up, up, 32
291 std r11, 32(rp)
292 addi rp, rp, 32
293 mr r12, r26
294 bdnz L(lo_m_1)
296 ALIGN(16)
C Store the top carry limb; done if that was the only v limb.
297 L(end_m_1):
298 addze r12, r12
299 addic. vn, vn, -1
300 std r12, 8(rp)
301 beq L(ret)
303 ALIGN(16)
C Outer loop: addmul of {up,un} by the next v limb into rp.
304 L(outer_lo_1):
305 mtctr un C copy inner loop count into ctr
306 addi rp, outer_rp, 8
307 mr up, outer_up
308 addi outer_rp, outer_rp, 8
309 ld v0, 0(vp) C new v limb
310 addi vp, vp, 8
311 ld r26, 0(up)
312 ld r28, 0(rp)
313 mulld r0, r26, v0
314 mulhdu r12, r26, v0
315 addc r0, r0, r28
316 std r0, 0(rp)
317 bdz L(end_1)
319 ALIGN(32) C registers dying
C Addmul inner loop: rp[] += {up,..} * v0, 4 limbs per iteration.
320 L(lo_1):
321 ld r26, 8(up)
322 ld r27, 16(up)
323 ld r20, 24(up) C
324 ld r21, 32(up) C
325 addi up, up, 32 C
326 addi rp, rp, 32 C
327 mulld r0, r26, v0 C
328 mulhdu r10, r26, v0 C 26
329 mulld r24, r27, v0 C
330 mulhdu r8, r27, v0 C 27
331 mulld r9, r20, v0 C
332 mulhdu r27, r20, v0 C 26
333 mulld r11, r21, v0 C
334 mulhdu r26, r21, v0 C 27
335 ld r28, -24(rp) C
336 adde r0, r0, r12 C 0 12
337 ld r29, -16(rp) C
338 adde r24, r24, r10 C 24 10
339 ld r30, -8(rp) C
340 ld r31, 0(rp) C
341 adde r9, r9, r8 C 8 9
342 adde r11, r11, r27 C 27 11
343 addze r12, r26 C 26
344 addc r0, r0, r28 C 0 28
345 std r0, -24(rp) C 0
346 adde r24, r24, r29 C 7 29
347 std r24, -16(rp) C 7
348 adde r9, r9, r30 C 9 30
349 std r9, -8(rp) C 9
350 adde r11, r11, r31 C 11 31
351 std r11, 0(rp) C 11
352 bdnz L(lo_1) C
354 ALIGN(16)
C Propagate the final carry, then loop while v limbs remain.
355 L(end_1):
356 addze r12, r12
357 addic. vn, vn, -1
358 std r12, 8(rp)
359 bne L(outer_lo_1)
360 b L(ret)
363 ALIGN(16)
C un mod 4 == 0: nothing to peel; bias the pointers back one limb
C so the loop's 8(..)..32(..) offsets line up, and start with a
C zero carry limb.
364 L(b0):
365 addi up, up, -8
366 addi rp, rp, -8
367 li r12, 0
368 addic r12, r12, 0 C clear CA for the adde chain in the loop
369 bdz L(end_m_0)
371 ALIGN(16)
C First-pass inner loop: {rp,..} = {up,..} * v0, 4 limbs/iteration;
C r12 carries the high limb between iterations.
372 L(lo_m_0):
373 ld r26, 8(up)
374 ld r27, 16(up)
375 ld r20, 24(up)
376 ld r21, 32(up)
377 mulld r0, r26, v0
378 mulhdu r31, r26, v0
379 mulld r24, r27, v0
380 mulhdu r8, r27, v0
381 mulld r9, r20, v0
382 mulhdu r27, r20, v0
383 mulld r11, r21, v0
384 mulhdu r26, r21, v0
385 adde r0, r0, r12
386 adde r24, r24, r31
387 std r0, 8(rp)
388 adde r9, r9, r8
389 std r24, 16(rp)
390 adde r11, r11, r27
391 std r9, 24(rp)
392 addi up, up, 32
393 std r11, 32(rp)
394 addi rp, rp, 32
395 mr r12, r26
396 bdnz L(lo_m_0)
398 ALIGN(16)
C Store the top carry limb; done if that was the only v limb.
399 L(end_m_0):
400 addze r12, r12
401 addic. vn, vn, -1
402 std r12, 8(rp)
403 beq L(ret)
405 ALIGN(16)
C Outer loop: addmul of {up,un} by the next v limb into rp.
406 L(outer_lo_0):
407 mtctr un C copy inner loop count into ctr
408 addi rp, outer_rp, 0
409 addi up, outer_up, -8
410 addi outer_rp, outer_rp, 8
411 ld v0, 0(vp) C new v limb
412 addi vp, vp, 8
413 li r12, 0
414 addic r12, r12, 0 C zero carry limb, clear CA
415 bdz L(end_0)
417 ALIGN(32) C registers dying
C Addmul inner loop: rp[] += {up,..} * v0, 4 limbs per iteration.
418 L(lo_0):
419 ld r26, 8(up)
420 ld r27, 16(up)
421 ld r20, 24(up) C
422 ld r21, 32(up) C
423 addi up, up, 32 C
424 addi rp, rp, 32 C
425 mulld r0, r26, v0 C
426 mulhdu r10, r26, v0 C 26
427 mulld r24, r27, v0 C
428 mulhdu r8, r27, v0 C 27
429 mulld r9, r20, v0 C
430 mulhdu r27, r20, v0 C 26
431 mulld r11, r21, v0 C
432 mulhdu r26, r21, v0 C 27
433 ld r28, -24(rp) C
434 adde r0, r0, r12 C 0 12
435 ld r29, -16(rp) C
436 adde r24, r24, r10 C 24 10
437 ld r30, -8(rp) C
438 ld r31, 0(rp) C
439 adde r9, r9, r8 C 8 9
440 adde r11, r11, r27 C 27 11
441 addze r12, r26 C 26
442 addc r0, r0, r28 C 0 28
443 std r0, -24(rp) C 0
444 adde r24, r24, r29 C 7 29
445 std r24, -16(rp) C 7
446 adde r9, r9, r30 C 9 30
447 std r9, -8(rp) C 9
448 adde r11, r11, r31 C 11 31
449 std r11, 0(rp) C 11
450 bdnz L(lo_0) C
452 ALIGN(16)
C Propagate the final carry, then loop while v limbs remain.
453 L(end_0):
454 addze r12, r12
455 addic. vn, vn, -1
456 std r12, 8(rp)
457 bne L(outer_lo_0)
458 b L(ret)
461 ALIGN(16)
C un mod 4 == 2: peel two limbs of the first (multiply-only) pass.
462 L(b2): ld r27, 8(up)
463 addi up, up, 8
464 mulld r0, r26, v0
465 mulhdu r10, r26, v0
466 mulld r24, r27, v0
467 mulhdu r8, r27, v0
468 addc r24, r24, r10
469 addze r12, r8 C r12 = carry limb into the loop below
470 std r0, 0(rp)
471 std r24, 8(rp)
472 addi rp, rp, 8
473 bdz L(end_m_2)
475 ALIGN(16)
C First-pass inner loop: {rp,..} = {up,..} * v0, 4 limbs/iteration;
C r12 carries the high limb between iterations.
476 L(lo_m_2):
477 ld r26, 8(up)
478 ld r27, 16(up)
479 ld r20, 24(up)
480 ld r21, 32(up)
481 mulld r0, r26, v0
482 mulhdu r31, r26, v0
483 mulld r24, r27, v0
484 mulhdu r8, r27, v0
485 mulld r9, r20, v0
486 mulhdu r27, r20, v0
487 mulld r11, r21, v0
488 mulhdu r26, r21, v0
489 adde r0, r0, r12
490 adde r24, r24, r31
491 std r0, 8(rp)
492 adde r9, r9, r8
493 std r24, 16(rp)
494 adde r11, r11, r27
495 std r9, 24(rp)
496 addi up, up, 32
497 std r11, 32(rp)
498 addi rp, rp, 32
499 mr r12, r26
500 bdnz L(lo_m_2)
502 ALIGN(16)
C Store the top carry limb; done if that was the only v limb.
503 L(end_m_2):
504 addze r12, r12
505 addic. vn, vn, -1
506 std r12, 8(rp)
507 beq L(ret)
509 ALIGN(16)
C Outer loop: addmul of {up,un} by the next v limb into rp.
510 L(outer_lo_2):
511 mtctr un C copy inner loop count into ctr
512 addi rp, outer_rp, 16
513 addi up, outer_up, 8
514 addi outer_rp, outer_rp, 8
515 ld v0, 0(vp) C new v limb
516 addi vp, vp, 8
517 ld r26, -8(up)
518 ld r27, 0(up)
519 ld r28, -8(rp)
520 ld r29, 0(rp)
521 mulld r0, r26, v0
522 mulhdu r10, r26, v0
523 mulld r24, r27, v0
524 mulhdu r8, r27, v0
525 addc r24, r24, r10
526 addze r12, r8
527 addc r0, r0, r28
528 std r0, -8(rp)
529 adde r24, r24, r29
530 std r24, 0(rp)
531 bdz L(end_2)
533 ALIGN(16) C registers dying
C Addmul inner loop: rp[] += {up,..} * v0, 4 limbs per iteration.
534 L(lo_2):
535 ld r26, 8(up)
536 ld r27, 16(up)
537 ld r20, 24(up) C
538 ld r21, 32(up) C
539 addi up, up, 32 C
540 addi rp, rp, 32 C
541 mulld r0, r26, v0 C
542 mulhdu r10, r26, v0 C 26
543 mulld r24, r27, v0 C
544 mulhdu r8, r27, v0 C 27
545 mulld r9, r20, v0 C
546 mulhdu r27, r20, v0 C 26
547 mulld r11, r21, v0 C
548 mulhdu r26, r21, v0 C 27
549 ld r28, -24(rp) C
550 adde r0, r0, r12 C 0 12
551 ld r29, -16(rp) C
552 adde r24, r24, r10 C 24 10
553 ld r30, -8(rp) C
554 ld r31, 0(rp) C
555 adde r9, r9, r8 C 8 9
556 adde r11, r11, r27 C 27 11
557 addze r12, r26 C 26
558 addc r0, r0, r28 C 0 28
559 std r0, -24(rp) C 0
560 adde r24, r24, r29 C 7 29
561 std r24, -16(rp) C 7
562 adde r9, r9, r30 C 9 30
563 std r9, -8(rp) C 9
564 adde r11, r11, r31 C 11 31
565 std r11, 0(rp) C 11
566 bdnz L(lo_2) C
568 ALIGN(16)
C Propagate the final carry, then loop while v limbs remain.
569 L(end_2):
570 addze r12, r12
571 addic. vn, vn, -1
572 std r12, 8(rp)
573 bne L(outer_lo_2)
574 C b L(ret)       C fall through into L(ret)
C Common exit: restore the callee-saved registers spilled at
C L(un_gt2), then return.
576 L(ret): ld r31, -8(r1)
577 ld r30, -16(r1)
578 ld r29, -24(r1)
579 ld r28, -32(r1)
580 ld r27, -40(r1)
581 ld r26, -48(r1)
582 ld r25, -56(r1)
583 ld r24, -64(r1)
584 ld r23, -72(r1)
585 ld r22, -80(r1)
586 ld r21, -88(r1)
587 ld r20, -96(r1)
blr C FIX: return to caller -- was missing, so control fell off the end of the function
589 EPILOGUE()