dnl  X86-64 mpn_mul_basecase optimised for Intel Nehalem/Westmere.
dnl  It also seems good for Conroe/Wolfdale.

dnl  Contributed to the GNU project by Torbjörn Granlund.

dnl  Copyright 2008, 2011-2013 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C              cycles/limb   mul_1    mul_2    mul_3    addmul_2
C AMD K8,K9
C AMD K10
C AMD bull
C AMD pile
C AMD steam
C AMD bobcat
C AMD jaguar
C Intel P4
C Intel core                  4.0      4.0      -       4.18-4.25
C Intel NHM                   3.75     3.8      -       4.06-4.2
C Intel SBR
C Intel IBR
C Intel HWL
C Intel BWL
C Intel atom
C VIA nano

C The inner loops of this code are the result of running a code generation and
C optimisation tool suite written by David Harvey and Torbjörn Granlund.

C Code structure:
C
C    m_1(0m4)        m_1(1m4)        m_1(2m4)        m_1(3m4)
C       |               |               |               |
C    m_2(0m4)   |    m_2(1m4)   |    m_2(2m4)   |    m_2(3m4)   |
C       |      /        |      /        |      /        |      /
C       |     /         |     /         |     /         |     /
C       |    /          |    /          |    /          |    /
C      \|/  |/_        \|/  |/_        \|/  |/_        \|/  |/_
C       _____           _____           _____           _____
C      /     \         /     \         /     \         /     \
C     \|/     |       \|/     |       \|/     |       \|/     |
C   am_2(0m4) |     am_2(1m4) |     am_2(2m4) |     am_2(3m4) |
C      \     /|\       \     /|\       \     /|\       \     /|\
C       \_____/         \_____/         \_____/         \_____/
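
C In words: one mul_1 pass (for odd vn) or a mul_2 pass writes the first
C product row(s), then addmul_2 passes accumulate two further v limbs each,
C with every stage specialised on un mod 4.  As orientation only, here is a
C hedged C sketch of the schoolbook product this file computes
C (ref_mul_basecase is an illustrative name, not a GMP function;
C mpn_mul_1 and mpn_addmul_1 are the documented mpn primitives):
C
C     #include <gmp.h>
C
C     /* rp must have room for un+vn limbs; un >= vn >= 1 is assumed.  */
C     static void
C     ref_mul_basecase (mp_ptr rp, mp_srcptr up, mp_size_t un,
C                       mp_srcptr vp, mp_size_t vn)
C     {
C       mp_size_t i;
C       rp[un] = mpn_mul_1 (rp, up, un, vp[0]);            /* first row  */
C       for (i = 1; i < vn; i++)                           /* later rows */
C         rp[un + i] = mpn_addmul_1 (rp + i, up, un, vp[i]);
C     }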

C TODO
C  * Tune.  None done so far.
C  * Currently 2687 bytes; making it smaller would be nice.
C  * Implement some basecases, say for un < 4.
C  * Try zeroing with xor in m2 loops.
C  * Try re-rolling the m2 loops to avoid the current 9 insn code duplication
C    between loop header and wind-down code.
C  * Consider adc reg,reg instead of adc $0,reg in m2 loops.  This saves a byte.

C When playing with pointers, set this to $2 to fall back to conservative
C indexing in wind-down code.
define(`I',`$1')

C Define this to $1 to use the late loop index variable as zero, or to $2
C to use an explicit $0.
define(`Z',`$1')
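
C For example, with the defaults above, I(-8(rp),8(rp,un,8)) expands to the
C cheap -8(rp) form and Z(i,$0) expands to i (which is zero at the wind-down
C points where Z is used); redefining them as $2 selects the conservative
C 8(rp,un,8) and literal $0 alternatives instead.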

define(`rp',       `%rdi')
define(`up',       `%rsi')
define(`un_param', `%rdx')
define(`vp_param', `%rcx')   C FIXME reallocate vp to rcx but watch performance!
define(`vn_param', `%r8')

define(`un', `%r9')
define(`vn', `(%rsp)')

define(`v0', `%r10')
define(`v1', `%r11')
define(`w0', `%rbx')
define(`w1', `%rcx')
define(`w2', `%rbp')
define(`w3', `%r12')
define(`i',  `%r13')
define(`vp', `%r14')

define(`X0', `%r8')
define(`X1', `%r15')

C rax rbx rcx rdx rdi rsi rbp r8 r9 r10 r11 r12 r13 r14 r15

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

define(`ALIGNx', `ALIGN(16)')

define(`N', 85)
ifdef(`N',,`define(`N',0)')
define(`MOV', `ifelse(eval(N & $3),0,`mov $1, $2',`lea ($1), $2')')
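
C With N = 85 (binary 1010101), each MOV site is named by a distinct power
C of two in its third argument: MOV(%rdx, X1, 2) expands to `mov %rdx, X1'
C since bit 1 of N is clear, and would expand to `lea (%rdx), X1' if it were
C set, so individual mov/lea choices can be toggled for tuning.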

ASM_START()
        TEXT
        ALIGN(32)
PROLOGUE(mpn_mul_basecase)
        FUNC_ENTRY(4)
IFDOS(` mov     56(%rsp), %r8d  ')
        mov     (up), %rax              C shared for mul_1 and mul_2
        push    %rbx
        push    %rbp
        push    %r12
        push    %r13
        push    %r14

        mov     (vp_param), v0          C shared for mul_1 and mul_2

        xor     un, un
        sub     un_param, un            C un = -un_param

        lea     (up,un_param,8), up
        lea     (rp,un_param,8), rp
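
C Both pointers now address just past their operands and un is negative, so
C the loops can run a single index register from -n up towards zero and use
C the sign (or carry) of `add $4, i' as the loop test.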

        mul     v0                      C shared for mul_1 and mul_2

        test    $1, R8(vn_param)
        jz      L(m2)
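
C Odd vn: peel one mul_1 pass first; even vn starts directly with mul_2.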

        lea     8(vp_param), vp         C FIXME: delay until known needed

        test    $1, R8(un)
        jnz     L(m1x1)

L(m1x0):test    $2, R8(un)
        jnz     L(m1s2)

L(m1s0):
        lea     (un), i
        mov     %rax, (rp,un,8)
        mov     8(up,un,8), %rax
        mov     %rdx, w0                C FIXME: Use lea?
        lea     L(do_am0)(%rip), %rbp
        jmp     L(m1e0)

L(m1s2):
        lea     2(un), i
        mov     %rax, (rp,un,8)
        mov     8(up,un,8), %rax
        mov     %rdx, w0                C FIXME: Use lea?
        mul     v0
        lea     L(do_am2)(%rip), %rbp
        test    i, i
        jnz     L(m1e2)
        add     %rax, w0
        adc     $0, %rdx
        mov     w0, I(-8(rp),8(rp,un,8))
        mov     %rdx, I((rp),16(rp,un,8))
        jmp     L(ret2)

L(m1x1):test    $2, R8(un)
        jz      L(m1s3)

L(m1s1):
        lea     1(un), i
        mov     %rax, (rp,un,8)
        test    i, i
        jz      L(1)
        mov     8(up,un,8), %rax
        mov     %rdx, w1                C FIXME: Use lea?
        lea     L(do_am1)(%rip), %rbp
        jmp     L(m1e1)
L(1):   mov     %rdx, I((rp),8(rp,un,8))
        jmp     L(ret2)

L(m1s3):
        lea     -1(un), i
        mov     %rax, (rp,un,8)
        mov     8(up,un,8), %rax
        mov     %rdx, w1                C FIXME: Use lea?
        lea     L(do_am3)(%rip), %rbp
        jmp     L(m1e3)

        ALIGNx
L(m1top):
        mul     v0
        mov     w1, -16(rp,i,8)
L(m1e2):xor     R32(w1), R32(w1)
        add     %rax, w0
        mov     (up,i,8), %rax
        adc     %rdx, w1
        mov     w0, -8(rp,i,8)
L(m1e1):xor     R32(w0), R32(w0)
        mul     v0
        add     %rax, w1
        mov     8(up,i,8), %rax
        adc     %rdx, w0
        mov     w1, (rp,i,8)
L(m1e0):xor     R32(w1), R32(w1)
        mul     v0
        add     %rax, w0
        mov     16(up,i,8), %rax
        adc     %rdx, w1
        mov     w0, 8(rp,i,8)
L(m1e3):xor     R32(w0), R32(w0)
        mul     v0
        add     %rax, w1
        mov     24(up,i,8), %rax
        adc     %rdx, w0
        add     $4, i
        js      L(m1top)

        mul     v0
        mov     w1, I(-16(rp),-16(rp,i,8))
        add     %rax, w0
        adc     $0, %rdx
        mov     w0, I(-8(rp),-8(rp,i,8))
        mov     %rdx, I((rp),(rp,i,8))

        dec     vn_param
        jz      L(ret2)
        lea     -8(rp), rp
        jmp     *%rbp
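
C Each mul_1 entry above preloaded %rbp with the address of the matching
C L(do_am*) block, so this indirect jump resumes at the addmul_2 code for
C the same un mod 4 residue.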

L(m2):
        mov     8(vp_param), v1
        lea     16(vp_param), vp        C FIXME: delay until known needed

        test    $1, R8(un)
        jnz     L(bx1)

L(bx0): test    $2, R8(un)
        jnz     L(b10)

L(b00): lea     (un), i
        mov     %rax, (rp,un,8)
        mov     %rdx, w1                C FIXME: Use lea?
        mov     (up,un,8), %rax
        mov     $0, R32(w2)
        jmp     L(m2e0)

L(b10): lea     -2(un), i
        mov     %rax, w2                C FIXME: Use lea?
        mov     (up,un,8), %rax
        mov     %rdx, w3                C FIXME: Use lea?
        mov     $0, R32(w0)
        jmp     L(m2e2)

L(bx1): test    $2, R8(un)
        jz      L(b11)

L(b01): lea     1(un), i
        mov     %rax, (rp,un,8)
        mov     (up,un,8), %rax
        mov     %rdx, w0                C FIXME: Use lea?
        mov     $0, R32(w1)
        jmp     L(m2e1)

L(b11): lea     -1(un), i
        mov     %rax, w1                C FIXME: Use lea?
        mov     (up,un,8), %rax
        mov     %rdx, w2                C FIXME: Use lea?
        mov     $0, R32(w3)
        jmp     L(m2e3)
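
C The four cases above bias i and preload the w registers so that the 4-way
C unrolled mul_2 loops below are entered at the phase matching un mod 4.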

        ALIGNx
L(m2top0):
        mul     v0
        add     %rax, w3
        mov     -8(up,i,8), %rax
        mov     w3, -8(rp,i,8)
        adc     %rdx, w0
        adc     $0, R32(w1)
        mul     v1
        add     %rax, w0
        adc     %rdx, w1
        mov     $0, R32(w2)
        mov     (up,i,8), %rax
        mul     v0
        add     %rax, w0
        mov     w0, (rp,i,8)
        adc     %rdx, w1
        mov     (up,i,8), %rax
        adc     $0, R32(w2)
L(m2e0):mul     v1
        add     %rax, w1
        adc     %rdx, w2
        mov     8(up,i,8), %rax
        mul     v0
        mov     $0, R32(w3)
        add     %rax, w1
        adc     %rdx, w2
        adc     $0, R32(w3)
        mov     8(up,i,8), %rax
        mul     v1
        add     %rax, w2
        mov     w1, 8(rp,i,8)
        adc     %rdx, w3
        mov     $0, R32(w0)
        mov     16(up,i,8), %rax
        mul     v0
        add     %rax, w2
        mov     16(up,i,8), %rax
        adc     %rdx, w3
        adc     $0, R32(w0)
        mul     v1
        mov     $0, R32(w1)
        add     %rax, w3
        mov     24(up,i,8), %rax
        mov     w2, 16(rp,i,8)
        adc     %rdx, w0
        add     $4, i
        js      L(m2top0)

        mul     v0
        add     %rax, w3
        mov     I(-8(up),-8(up,i,8)), %rax
        mov     w3, I(-8(rp),-8(rp,i,8))
        adc     %rdx, w0
        adc     R32(w1), R32(w1)
        mul     v1
        add     %rax, w0
        adc     %rdx, w1
        mov     w0, I((rp),(rp,i,8))
        mov     w1, I(8(rp),8(rp,i,8))

        add     $-2, vn_param
        jz      L(ret2)

L(do_am0):
        push    %r15
        push    vn_param
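
C Outer addmul_2 loop: %r15 is saved and the remaining vn count is kept on
C the stack (the vn define expands to (%rsp)); each olo* pass consumes two
C v limbs and the addl $-2 at its bottom decrements the count, which L(ret)
C finally discards with pop %rax.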

L(olo0):
        mov     (vp), v0
        mov     8(vp), v1
        lea     16(vp), vp
        lea     16(rp), rp
        mov     (up,un,8), %rax
C       lea     0(un), i
        mov     un, i
        mul     v0
        mov     %rax, X0
        mov     (up,un,8), %rax
        MOV(    %rdx, X1, 2)
        mul     v1
        MOV(    %rdx, w0, 4)
        mov     (rp,un,8), w2
        mov     %rax, w3
        jmp     L(lo0)

        ALIGNx
L(am2top0):
        mul     v1
        add     w0, w1
        adc     %rax, w2
        mov     (up,i,8), %rax
        MOV(    %rdx, w3, 1)
        adc     $0, w3
        mul     v0
        add     w1, X1
        mov     X1, -8(rp,i,8)
        adc     %rax, X0
        MOV(    %rdx, X1, 2)
        adc     $0, X1
        mov     (up,i,8), %rax
        mul     v1
        MOV(    %rdx, w0, 4)
        mov     (rp,i,8), w1
        add     w1, w2
        adc     %rax, w3
        adc     $0, w0
L(lo0): mov     8(up,i,8), %rax
        mul     v0
        add     w2, X0
        adc     %rax, X1
        mov     X0, (rp,i,8)
        MOV(    %rdx, X0, 8)
        adc     $0, X0
        mov     8(up,i,8), %rax
        mov     8(rp,i,8), w2
        mul     v1
        add     w2, w3
        adc     %rax, w0
        MOV(    %rdx, w1, 16)
        adc     $0, w1
        mov     16(up,i,8), %rax
        mul     v0
        add     w3, X1
        mov     X1, 8(rp,i,8)
        adc     %rax, X0
        MOV(    %rdx, X1, 32)
        mov     16(rp,i,8), w3
        adc     $0, X1
        mov     16(up,i,8), %rax
        mul     v1
        add     w3, w0
        MOV(    %rdx, w2, 64)
        adc     %rax, w1
        mov     24(up,i,8), %rax
        adc     $0, w2
        mul     v0
        add     w0, X0
        mov     X0, 16(rp,i,8)
        MOV(    %rdx, X0, 128)
        adc     %rax, X1
        mov     24(up,i,8), %rax
        mov     24(rp,i,8), w0
        adc     $0, X0
        add     $4, i
        jnc     L(am2top0)

        mul     v1
        add     w0, w1
        adc     %rax, w2
        adc     Z(i,$0), %rdx
        add     w1, X1
        adc     Z(i,$0), X0
        mov     X1, I(-8(rp),-8(rp,i,8))
        add     w2, X0
        mov     X0, I((rp),(rp,i,8))
        adc     Z(i,$0), %rdx
        mov     %rdx, I(8(rp),8(rp,i,8))

        addl    $-2, vn
        jnz     L(olo0)

L(ret): pop     %rax
        pop     %r15
L(ret2):pop     %r14
        pop     %r13
        pop     %r12
        pop     %rbp
        pop     %rbx
        FUNC_EXIT()
        ret

        ALIGNx
L(m2top1):
        mul     v0
        add     %rax, w3
        mov     -8(up,i,8), %rax
        mov     w3, -8(rp,i,8)
        adc     %rdx, w0
        adc     $0, R32(w1)
L(m2e1):mul     v1
        add     %rax, w0
        adc     %rdx, w1
        mov     $0, R32(w2)
        mov     (up,i,8), %rax
        mul     v0
        add     %rax, w0
        mov     w0, (rp,i,8)
        adc     %rdx, w1
        mov     (up,i,8), %rax
        adc     $0, R32(w2)
        mul     v1
        add     %rax, w1
        adc     %rdx, w2
        mov     8(up,i,8), %rax
        mul     v0
        mov     $0, R32(w3)
        add     %rax, w1
        adc     %rdx, w2
        adc     $0, R32(w3)
        mov     8(up,i,8), %rax
        mul     v1
        add     %rax, w2
        mov     w1, 8(rp,i,8)
        adc     %rdx, w3
        mov     $0, R32(w0)
        mov     16(up,i,8), %rax
        mul     v0
        add     %rax, w2
        mov     16(up,i,8), %rax
        adc     %rdx, w3
        adc     $0, R32(w0)
        mul     v1
        mov     $0, R32(w1)
        add     %rax, w3
        mov     24(up,i,8), %rax
        mov     w2, 16(rp,i,8)
        adc     %rdx, w0
        add     $4, i
        js      L(m2top1)

        mul     v0
        add     %rax, w3
        mov     I(-8(up),-8(up,i,8)), %rax
        mov     w3, I(-8(rp),-8(rp,i,8))
        adc     %rdx, w0
        adc     R32(w1), R32(w1)
        mul     v1
        add     %rax, w0
        adc     %rdx, w1
        mov     w0, I((rp),(rp,i,8))
        mov     w1, I(8(rp),8(rp,i,8))

        add     $-2, vn_param
        jz      L(ret2)

L(do_am1):
        push    %r15
        push    vn_param

L(olo1):
        mov     (vp), v0
        mov     8(vp), v1
        lea     16(vp), vp
        lea     16(rp), rp
        mov     (up,un,8), %rax
        lea     1(un), i
        mul     v0
        mov     %rax, X1
        MOV(    %rdx, X0, 128)
        mov     (up,un,8), %rax
        mov     (rp,un,8), w1
        mul     v1
        mov     %rax, w2
        mov     8(up,un,8), %rax
        MOV(    %rdx, w3, 1)
        jmp     L(lo1)

        ALIGNx
L(am2top1):
        mul     v1
        add     w0, w1
        adc     %rax, w2
        mov     (up,i,8), %rax
        MOV(    %rdx, w3, 1)
        adc     $0, w3
L(lo1): mul     v0
        add     w1, X1
        mov     X1, -8(rp,i,8)
        adc     %rax, X0
        MOV(    %rdx, X1, 2)
        adc     $0, X1
        mov     (up,i,8), %rax
        mul     v1
        MOV(    %rdx, w0, 4)
        mov     (rp,i,8), w1
        add     w1, w2
        adc     %rax, w3
        adc     $0, w0
        mov     8(up,i,8), %rax
        mul     v0
        add     w2, X0
        adc     %rax, X1
        mov     X0, (rp,i,8)
        MOV(    %rdx, X0, 8)
        adc     $0, X0
        mov     8(up,i,8), %rax
        mov     8(rp,i,8), w2
        mul     v1
        add     w2, w3
        adc     %rax, w0
        MOV(    %rdx, w1, 16)
        adc     $0, w1
        mov     16(up,i,8), %rax
        mul     v0
        add     w3, X1
        mov     X1, 8(rp,i,8)
        adc     %rax, X0
        MOV(    %rdx, X1, 32)
        mov     16(rp,i,8), w3
        adc     $0, X1
        mov     16(up,i,8), %rax
        mul     v1
        add     w3, w0
        MOV(    %rdx, w2, 64)
        adc     %rax, w1
        mov     24(up,i,8), %rax
        adc     $0, w2
        mul     v0
        add     w0, X0
        mov     X0, 16(rp,i,8)
        MOV(    %rdx, X0, 128)
        adc     %rax, X1
        mov     24(up,i,8), %rax
        mov     24(rp,i,8), w0
        adc     $0, X0
        add     $4, i
        jnc     L(am2top1)

        mul     v1
        add     w0, w1
        adc     %rax, w2
        adc     Z(i,$0), %rdx
        add     w1, X1
        adc     Z(i,$0), X0
        mov     X1, I(-8(rp),-8(rp,i,8))
        add     w2, X0
        mov     X0, I((rp),(rp,i,8))
        adc     Z(i,$0), %rdx
        mov     %rdx, I(8(rp),8(rp,i,8))

        addl    $-2, vn
        jnz     L(olo1)

        pop     %rax
        pop     %r15
        pop     %r14
        pop     %r13
        pop     %r12
        pop     %rbp
        pop     %rbx
        FUNC_EXIT()
        ret

        ALIGNx
L(m2top2):
        mul     v0
        add     %rax, w3
        mov     -8(up,i,8), %rax
        mov     w3, -8(rp,i,8)
        adc     %rdx, w0
        adc     $0, R32(w1)
        mul     v1
        add     %rax, w0
        adc     %rdx, w1
        mov     $0, R32(w2)
        mov     (up,i,8), %rax
        mul     v0
        add     %rax, w0
        mov     w0, (rp,i,8)
        adc     %rdx, w1
        mov     (up,i,8), %rax
        adc     $0, R32(w2)
        mul     v1
        add     %rax, w1
        adc     %rdx, w2
        mov     8(up,i,8), %rax
        mul     v0
        mov     $0, R32(w3)
        add     %rax, w1
        adc     %rdx, w2
        adc     $0, R32(w3)
        mov     8(up,i,8), %rax
        mul     v1
        add     %rax, w2
        mov     w1, 8(rp,i,8)
        adc     %rdx, w3
        mov     $0, R32(w0)
        mov     16(up,i,8), %rax
        mul     v0
        add     %rax, w2
        mov     16(up,i,8), %rax
        adc     %rdx, w3
        adc     $0, R32(w0)
L(m2e2):mul     v1
        mov     $0, R32(w1)
        add     %rax, w3
        mov     24(up,i,8), %rax
        mov     w2, 16(rp,i,8)
        adc     %rdx, w0
        add     $4, i
        js      L(m2top2)

        mul     v0
        add     %rax, w3
        mov     I(-8(up),-8(up,i,8)), %rax
        mov     w3, I(-8(rp),-8(rp,i,8))
        adc     %rdx, w0
        adc     R32(w1), R32(w1)
        mul     v1
        add     %rax, w0
        adc     %rdx, w1
        mov     w0, I((rp),(rp,i,8))
        mov     w1, I(8(rp),8(rp,i,8))

        add     $-2, vn_param
        jz      L(ret2)

L(do_am2):
        push    %r15
        push    vn_param

L(olo2):
        mov     (vp), v0
        mov     8(vp), v1
        lea     16(vp), vp
        lea     16(rp), rp
        mov     (up,un,8), %rax
        lea     -2(un), i
        mul     v0
        mov     %rax, X0
        MOV(    %rdx, X1, 32)
        mov     (up,un,8), %rax
        mov     (rp,un,8), w0
        mul     v1
        mov     %rax, w1
        lea     (%rdx), w2
        mov     8(up,un,8), %rax
        jmp     L(lo2)

        ALIGNx
L(am2top2):
        mul     v1
        add     w0, w1
        adc     %rax, w2
        mov     (up,i,8), %rax
        MOV(    %rdx, w3, 1)
        adc     $0, w3
        mul     v0
        add     w1, X1
        mov     X1, -8(rp,i,8)
        adc     %rax, X0
        MOV(    %rdx, X1, 2)
        adc     $0, X1
        mov     (up,i,8), %rax
        mul     v1
        MOV(    %rdx, w0, 4)
        mov     (rp,i,8), w1
        add     w1, w2
        adc     %rax, w3
        adc     $0, w0
        mov     8(up,i,8), %rax
        mul     v0
        add     w2, X0
        adc     %rax, X1
        mov     X0, (rp,i,8)
        MOV(    %rdx, X0, 8)
        adc     $0, X0
        mov     8(up,i,8), %rax
        mov     8(rp,i,8), w2
        mul     v1
        add     w2, w3
        adc     %rax, w0
        MOV(    %rdx, w1, 16)
        adc     $0, w1
        mov     16(up,i,8), %rax
        mul     v0
        add     w3, X1
        mov     X1, 8(rp,i,8)
        adc     %rax, X0
        MOV(    %rdx, X1, 32)
        mov     16(rp,i,8), w3
        adc     $0, X1
        mov     16(up,i,8), %rax
        mul     v1
        add     w3, w0
        MOV(    %rdx, w2, 64)
        adc     %rax, w1
        mov     24(up,i,8), %rax
        adc     $0, w2
L(lo2): mul     v0
        add     w0, X0
        mov     X0, 16(rp,i,8)
        MOV(    %rdx, X0, 128)
        adc     %rax, X1
        mov     24(up,i,8), %rax
        mov     24(rp,i,8), w0
        adc     $0, X0
        add     $4, i
        jnc     L(am2top2)

        mul     v1
        add     w0, w1
        adc     %rax, w2
        adc     Z(i,$0), %rdx
        add     w1, X1
        adc     Z(i,$0), X0
        mov     X1, I(-8(rp),-8(rp,i,8))
        add     w2, X0
        mov     X0, I((rp),(rp,i,8))
        adc     Z(i,$0), %rdx
        mov     %rdx, I(8(rp),8(rp,i,8))

        addl    $-2, vn
        jnz     L(olo2)

        pop     %rax
        pop     %r15
        pop     %r14
        pop     %r13
        pop     %r12
        pop     %rbp
        pop     %rbx
        FUNC_EXIT()
        ret

        ALIGNx
L(m2top3):
        mul     v0
        add     %rax, w3
        mov     -8(up,i,8), %rax
        mov     w3, -8(rp,i,8)
        adc     %rdx, w0
        adc     $0, R32(w1)
        mul     v1
        add     %rax, w0
        adc     %rdx, w1
        mov     $0, R32(w2)
        mov     (up,i,8), %rax
        mul     v0
        add     %rax, w0
        mov     w0, (rp,i,8)
        adc     %rdx, w1
        mov     (up,i,8), %rax
        adc     $0, R32(w2)
        mul     v1
        add     %rax, w1
        adc     %rdx, w2
        mov     8(up,i,8), %rax
        mul     v0
        mov     $0, R32(w3)
        add     %rax, w1
        adc     %rdx, w2
        adc     $0, R32(w3)
        mov     8(up,i,8), %rax
L(m2e3):mul     v1
        add     %rax, w2
        mov     w1, 8(rp,i,8)
        adc     %rdx, w3
        mov     $0, R32(w0)
        mov     16(up,i,8), %rax
        mul     v0
        add     %rax, w2
        mov     16(up,i,8), %rax
        adc     %rdx, w3
        adc     $0, R32(w0)
        mul     v1
        mov     $0, R32(w1)
        add     %rax, w3
        mov     24(up,i,8), %rax
        mov     w2, 16(rp,i,8)
        adc     %rdx, w0
        add     $4, i
        js      L(m2top3)

        mul     v0
        add     %rax, w3
        mov     I(-8(up),-8(up,i,8)), %rax
        mov     w3, I(-8(rp),-8(rp,i,8))
        adc     %rdx, w0
        adc     $0, R32(w1)
        mul     v1
        add     %rax, w0
        adc     %rdx, w1
        mov     w0, I((rp),(rp,i,8))
        mov     w1, I(8(rp),8(rp,i,8))

        add     $-2, vn_param
        jz      L(ret2)

L(do_am3):
        push    %r15
        push    vn_param

L(olo3):
        mov     (vp), v0
        mov     8(vp), v1
        lea     16(vp), vp
        lea     16(rp), rp
        mov     (up,un,8), %rax
        lea     -1(un), i
        mul     v0
        mov     %rax, X1
        MOV(    %rdx, X0, 8)
        mov     (up,un,8), %rax
        mov     (rp,un,8), w3
        mul     v1
        mov     %rax, w0
        MOV(    %rdx, w1, 16)
        mov     8(up,un,8), %rax
        jmp     L(lo3)

        ALIGNx
L(am2top3):
        mul     v1
        add     w0, w1
        adc     %rax, w2
        mov     (up,i,8), %rax
        MOV(    %rdx, w3, 1)
        adc     $0, w3
        mul     v0
        add     w1, X1
        mov     X1, -8(rp,i,8)
        adc     %rax, X0
        MOV(    %rdx, X1, 2)
        adc     $0, X1
        mov     (up,i,8), %rax
        mul     v1
        MOV(    %rdx, w0, 4)
        mov     (rp,i,8), w1
        add     w1, w2
        adc     %rax, w3
        adc     $0, w0
        mov     8(up,i,8), %rax
        mul     v0
        add     w2, X0
        adc     %rax, X1
        mov     X0, (rp,i,8)
        MOV(    %rdx, X0, 8)
        adc     $0, X0
        mov     8(up,i,8), %rax
        mov     8(rp,i,8), w2
        mul     v1
        add     w2, w3
        adc     %rax, w0
        MOV(    %rdx, w1, 16)
        adc     $0, w1
        mov     16(up,i,8), %rax
L(lo3): mul     v0
        add     w3, X1
        mov     X1, 8(rp,i,8)
        adc     %rax, X0
        MOV(    %rdx, X1, 32)
        mov     16(rp,i,8), w3
        adc     $0, X1
        mov     16(up,i,8), %rax
        mul     v1
        add     w3, w0
        MOV(    %rdx, w2, 64)
        adc     %rax, w1
        mov     24(up,i,8), %rax
        adc     $0, w2
        mul     v0
        add     w0, X0
        mov     X0, 16(rp,i,8)
        MOV(    %rdx, X0, 128)
        adc     %rax, X1
        mov     24(up,i,8), %rax
        mov     24(rp,i,8), w0
        adc     $0, X0
        add     $4, i
        jnc     L(am2top3)

        mul     v1
        add     w0, w1
        adc     %rax, w2
        adc     Z(i,$0), %rdx
        add     w1, X1
        adc     Z(i,$0), X0
        mov     X1, I(-8(rp),-8(rp,i,8))
        add     w2, X0
        mov     X0, I((rp),(rp,i,8))
        adc     Z(i,$0), %rdx
        mov     %rdx, I(8(rp),8(rp,i,8))

        addl    $-2, vn
        jnz     L(olo3)

        pop     %rax
        pop     %r15
        pop     %r14
        pop     %r13
        pop     %r12
        pop     %rbp
        pop     %rbx
        FUNC_EXIT()
        ret
EPILOGUE()