dnl  X86-64 mpn_sqr_basecase optimised for Intel Nehalem/Westmere.
dnl  It also seems good for Conroe/Wolfdale.

dnl  Contributed to the GNU project by Torbjörn Granlund.

dnl  Copyright 2008, 2011-2013 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C cycles/limb		mul_2		addmul_2	sqr_diag_addlsh1
C AMD K8,K9
C AMD K10
C AMD bull
C AMD pile
C AMD steam
C AMD bobcat
C AMD jaguar
C Intel P4
C Intel core		 4.9		4.18-4.25	 3.87
C Intel NHM		 3.8		4.06-4.2	 3.5
C Intel SBR
C Intel IBR
C Intel HWL
C Intel BWL
C Intel atom
C VIA nano

C The inner loops of this code are the result of running a code generation and
C optimisation tool suite written by David Harvey and Torbjörn Granlund.

C Code structure:
C
C
C        m_2(0m4)  m_2(2m4)  m_2(1m4)  m_2(3m4)
C           |         |         |         |
C           |         |         |         |
C           |         |         |         |
C          \|/       \|/       \|/       \|/
C           ____________        ____________
C          /            \      /            \
C         \|/            \    \|/            \
C     am_2(3m4)   am_2(1m4)  am_2(0m4)   am_2(2m4)
C         \          /|\         \          /|\
C          \____________/         \____________/
C                 \                     /
C                  \                   /
C                   \                 /
C               tail(0m2)       tail(1m2)
C                    \             /
C                     \           /
C                   sqr_diag_addlsh1
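
C The n mod 4 residue selects one of four mul_2 feed-in points; control
C then alternates within an addmul_2 pair (am_2(3m4)/am_2(1m4) or
C am_2(0m4)/am_2(2m4)) until the operand is consumed, falls into a
C parity-dependent tail (L(cor1) or L(cor2)), and finishes with the
C sqr_diag_addlsh1 pass, which adds the diagonal squares while doubling
C the off-diagonal triangle.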

C TODO
C  * Tune.  None done so far.
C  * Currently 2761 bytes, making it smaller would be nice.
C  * Consider using a jumptab-based entry sequence.  One might even use a mask-
C    less sequence, if the table is large enough to support tuneup's needs.
C    The code would be, using non-PIC code,
C	lea	tab(%rip),%rax;  jmp	*(n,%rax)
C    or,
C	lea	tab(%rip),%rax;  lea	(%rip),%rbx;  add	(n,%rax),%rbx;  jmp	*%rbx
C    using PIC code.  The table entries would be Ln1,Ln2,Ln3,Lm0,Lm1,Lm2,Lm3,..
C    with the last four entries repeated a safe number of times.
C  * Consider expanding feed-in code in order to avoid zeroing registers.
C  * Zero consistently with xor.
C  * Check if using "lea (reg),reg" should be done in more places; we have some
C    explicit "mov %rax,reg" now.
C  * Try zeroing with xor in m2 loops.
C  * Try re-rolling the m2 loops to avoid the current 9 insn code duplication
C    between loop header and wind-down code.
C  * Consider adc reg,reg instead of adc $0,reg in m2 loops.  This saves a byte.

C When playing with pointers, set this to $2 to fall back to conservative
C indexing in wind-down code.
define(`I',`$1')

C Define this to $1 to use late loop index variable as zero, $2 to use an
C explicit $0.
define(`Z',`$1')
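
C Example: with I defined as $1, I(-8(rp),-8(rp,i,8)) expands to the
C constant-address form -8(rp); with $2 it expands to the indexed form
C -8(rp,i,8).  Likewise Z(i,$0) yields the register i (known to hold
C zero in the wind-down code) or a literal $0.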

define(`rp',	  `%rdi')
define(`up',	  `%rsi')
define(`n_param', `%rdx')

define(`n',	  `%r8')

define(`v0',	  `%r10')
define(`v1',	  `%r11')
define(`w0',	  `%rbx')
define(`w1',	  `%rcx')
define(`w2',	  `%rbp')
define(`w3',	  `%r9')
define(`i',	  `%r13')

define(`X0',	  `%r12')
define(`X1',	  `%r14')

C rax rbx rcx rdx rdi rsi rbp r8 r9 r10 r11 r12 r13 r14 r15

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

define(`ALIGNx', `ALIGN(16)')

define(`N', 85)
ifdef(`N',,`define(`N',0)')
define(`MOV', `ifelse(eval(N & $3),0,`mov	$1, $2',`lea	($1), $2')')
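
C MOV(src,dst,mask) emits "mov src, dst" when N & mask is zero, and
C "lea (src), dst" otherwise; the bit mask N thus selects, per call
C site, which register copies are rewritten as lea (cf. the TODO above).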

ASM_START()
	TEXT
	ALIGN(32)
PROLOGUE(mpn_sqr_basecase)
	FUNC_ENTRY(3)

	cmp	$4, n_param
	jl	L(small)

	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14

	mov	(up), v0
	mov	8(up), %rax
	mov	%rax, v1

	mov	$1, R32(n)
	sub	n_param, n		C n = -n_param+1
	push	n

	lea	(up,n_param,8), up
	lea	(rp,n_param,8), rp
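
C up and rp now point just beyond their operands; the negative indices
C n and i walk both areas upwards towards zero.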

	mul	v0

	test	$1, R8(n)
	jnz	L(bx1)

L(bx0):	test	$2, R8(n)
	mov	%rax, (rp,n,8)
	jnz	L(b10)

L(b00):	lea	(n), i			C n = 5, 9, ...
	mov	%rdx, w1		C FIXME: Use lea?
	xor	R32(w2), R32(w2)
	jmp	L(m2e0)

L(b10):	lea	2(n), i			C n = 7, 11, ...
	mov	8(up,n,8), %rax
	mov	%rdx, w3		C FIXME: Use lea?
	xor	R32(w0), R32(w0)
	xor	R32(w1), R32(w1)
	jmp	L(m2e2)

L(bx1):	test	$2, R8(n)
	mov	%rax, (rp,n,8)
	jz	L(b11)

L(b01):	lea	1(n), i			C n = 6, 10, ...
	mov	%rdx, w0		C FIXME: Use lea?
	xor	R32(w1), R32(w1)
	jmp	L(m2e1)

L(b11):	lea	-1(n), i		C n = 4, 8, 12, ...
	mov	%rdx, w2		C FIXME: Use lea?
	xor	R32(w3), R32(w3)
	jmp	L(m2e3)
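
C The branches above dispatch on the low two bits of the biased size
C counter, so that every size enters the 4-way unrolled mul_2 loops at
C the matching entry point.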

	ALIGNx
L(m2top1):
	mul	v0
	add	%rax, w3
	mov	-8(up,i,8), %rax
	mov	w3, -8(rp,i,8)
	adc	%rdx, w0
	adc	$0, R32(w1)
	mul	v1
	add	%rax, w0
	adc	%rdx, w1
L(m2e1):mov	$0, R32(w2)
	mov	(up,i,8), %rax
	mul	v0
	add	%rax, w0
	mov	w0, (rp,i,8)
	adc	%rdx, w1
	mov	(up,i,8), %rax
	adc	$0, R32(w2)
	mul	v1
	add	%rax, w1
	adc	%rdx, w2
	mov	8(up,i,8), %rax
	mul	v0
	mov	$0, R32(w3)
	add	%rax, w1
	adc	%rdx, w2
	adc	$0, R32(w3)
	mov	8(up,i,8), %rax
	mul	v1
	add	%rax, w2
	mov	w1, 8(rp,i,8)
	adc	%rdx, w3
	mov	$0, R32(w0)
	mov	16(up,i,8), %rax
	mul	v0
	add	%rax, w2
	mov	16(up,i,8), %rax
	adc	%rdx, w3
	adc	$0, R32(w0)
	mul	v1
	mov	$0, R32(w1)
	add	%rax, w3
	mov	24(up,i,8), %rax
	mov	w2, 16(rp,i,8)
	adc	%rdx, w0
	add	$4, i
	js	L(m2top1)

	mul	v0
	add	%rax, w3
	mov	I(-8(up),-8(up,i,8)), %rax
	mov	w3, I(-8(rp),-8(rp,i,8))
	adc	%rdx, w0
	adc	R32(w1), R32(w1)
	mul	v1
	add	w0, %rax
	adc	w1, %rdx
	mov	%rax, I((rp),(rp,i,8))
	mov	%rdx, I(8(rp),8(rp,i,8))

	lea	16(rp), rp
	add	$2, n			C decrease |n|
	jmp	L(am2o3)

	ALIGNx
L(m2top3):
	mul	v0
	add	%rax, w3
	mov	-8(up,i,8), %rax
	mov	w3, -8(rp,i,8)
	adc	%rdx, w0
	adc	$0, R32(w1)
	mul	v1
	add	%rax, w0
	adc	%rdx, w1
	mov	$0, R32(w2)
	mov	(up,i,8), %rax
	mul	v0
	add	%rax, w0
	mov	w0, (rp,i,8)
	adc	%rdx, w1
	mov	(up,i,8), %rax
	adc	$0, R32(w2)
	mul	v1
	add	%rax, w1
	adc	%rdx, w2
	mov	8(up,i,8), %rax
	mul	v0
	mov	$0, R32(w3)
	add	%rax, w1
	adc	%rdx, w2
	adc	$0, R32(w3)
	mov	8(up,i,8), %rax
	mul	v1
	add	%rax, w2
	mov	w1, 8(rp,i,8)
	adc	%rdx, w3
L(m2e3):mov	$0, R32(w0)
	mov	16(up,i,8), %rax
	mul	v0
	add	%rax, w2
	mov	16(up,i,8), %rax
	adc	%rdx, w3
	adc	$0, R32(w0)
	mul	v1
	mov	$0, R32(w1)
	add	%rax, w3
	mov	24(up,i,8), %rax
	mov	w2, 16(rp,i,8)
	adc	%rdx, w0
	add	$4, i
	js	L(m2top3)

	mul	v0
	add	%rax, w3
	mov	I(-8(up),-8(up,i,8)), %rax
	mov	w3, I(-8(rp),-8(rp,i,8))
	adc	%rdx, w0
	adc	R32(w1), R32(w1)
	mul	v1
	add	w0, %rax
	adc	w1, %rdx
	mov	%rax, I((rp),(rp,i,8))
	mov	%rdx, I(8(rp),8(rp,i,8))

	lea	16(rp), rp
	add	$2, n			C decrease |n|
	cmp	$-1, n
	jz	L(cor1)			C jumps iff entry n = 4
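
C Each L(am2oX) block loads the next (v0,v1) limb pair, primes the
C accumulators, and jumps into the software-pipelined addmul_2 loop at
C the entry point matching the loop index residue.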

L(am2o1):
	mov	-8(up,n,8), v0
	mov	(up,n,8), %rax
	mov	%rax, v1
	lea	1(n), i
	mul	v0
	mov	%rax, X1
	MOV(	%rdx, X0, 128)
	mov	(rp,n,8), w1
	xor	R32(w2), R32(w2)
	mov	8(up,n,8), %rax
	xor	R32(w3), R32(w3)
	jmp	L(lo1)

	ALIGNx
L(am2top1):
	mul	v1
	add	w0, w1
	adc	%rax, w2
	mov	(up,i,8), %rax
	MOV(	%rdx, w3, 1)
	adc	$0, w3
L(lo1):	mul	v0
	add	w1, X1
	mov	X1, -8(rp,i,8)
	adc	%rax, X0
	MOV(	%rdx, X1, 2)
	adc	$0, X1
	mov	(up,i,8), %rax
	mul	v1
	MOV(	%rdx, w0, 4)
	mov	(rp,i,8), w1
	add	w1, w2
	adc	%rax, w3
	adc	$0, w0
	mov	8(up,i,8), %rax
	mul	v0
	add	w2, X0
	adc	%rax, X1
	mov	X0, (rp,i,8)
	MOV(	%rdx, X0, 8)
	adc	$0, X0
	mov	8(up,i,8), %rax
	mov	8(rp,i,8), w2
	mul	v1
	add	w2, w3
	adc	%rax, w0
	MOV(	%rdx, w1, 16)
	adc	$0, w1
	mov	16(up,i,8), %rax
	mul	v0
	add	w3, X1
	mov	X1, 8(rp,i,8)
	adc	%rax, X0
	MOV(	%rdx, X1, 32)
	mov	16(rp,i,8), w3
	adc	$0, X1
	mov	16(up,i,8), %rax
	mul	v1
	add	w3, w0
	MOV(	%rdx, w2, 64)
	adc	%rax, w1
	mov	24(up,i,8), %rax
	adc	$0, w2
	mul	v0
	add	w0, X0
	mov	X0, 16(rp,i,8)
	MOV(	%rdx, X0, 128)
	adc	%rax, X1
	mov	24(up,i,8), %rax
	mov	24(rp,i,8), w0
	adc	$0, X0
	add	$4, i
	jnc	L(am2top1)

	mul	v1
	add	w0, w1
	adc	w2, %rax
	adc	Z(i,$0), %rdx
	add	w1, X1
	adc	Z(i,$0), X0
	mov	X1, I(-8(rp),-8(rp,i,8))
	add	X0, %rax
	mov	%rax, I((rp),(rp,i,8))
	adc	Z(i,$0), %rdx
	mov	%rdx, I(8(rp),8(rp,i,8))

	lea	16(rp), rp
	add	$2, n

L(am2o3):
	mov	-8(up,n,8), v0
	mov	(up,n,8), %rax
	mov	%rax, v1
	lea	-1(n), i
	mul	v0
	mov	%rax, X1
	MOV(	%rdx, X0, 8)
	mov	(rp,n,8), w3
	xor	R32(w0), R32(w0)
	xor	R32(w1), R32(w1)
	mov	8(up,n,8), %rax
	jmp	L(lo3)

	ALIGNx
L(am2top3):
	mul	v1
	add	w0, w1
	adc	%rax, w2
	mov	(up,i,8), %rax
	MOV(	%rdx, w3, 1)
	adc	$0, w3
	mul	v0
	add	w1, X1
	mov	X1, -8(rp,i,8)
	adc	%rax, X0
	MOV(	%rdx, X1, 2)
	adc	$0, X1
	mov	(up,i,8), %rax
	mul	v1
	MOV(	%rdx, w0, 4)
	mov	(rp,i,8), w1
	add	w1, w2
	adc	%rax, w3
	adc	$0, w0
	mov	8(up,i,8), %rax
	mul	v0
	add	w2, X0
	adc	%rax, X1
	mov	X0, (rp,i,8)
	MOV(	%rdx, X0, 8)
	adc	$0, X0
	mov	8(up,i,8), %rax
	mov	8(rp,i,8), w2
	mul	v1
	add	w2, w3
	adc	%rax, w0
	MOV(	%rdx, w1, 16)
	adc	$0, w1
	mov	16(up,i,8), %rax
L(lo3):	mul	v0
	add	w3, X1
	mov	X1, 8(rp,i,8)
	adc	%rax, X0
	MOV(	%rdx, X1, 32)
	mov	16(rp,i,8), w3
	adc	$0, X1
	mov	16(up,i,8), %rax
	mul	v1
	add	w3, w0
	MOV(	%rdx, w2, 64)
	adc	%rax, w1
	mov	24(up,i,8), %rax
	adc	$0, w2
	mul	v0
	add	w0, X0
	mov	X0, 16(rp,i,8)
	MOV(	%rdx, X0, 128)
	adc	%rax, X1
	mov	24(up,i,8), %rax
	mov	24(rp,i,8), w0
	adc	$0, X0
	add	$4, i
	jnc	L(am2top3)

	mul	v1
	add	w0, w1
	adc	w2, %rax
	adc	Z(i,$0), %rdx
	add	w1, X1
	adc	Z(i,$0), X0
	mov	X1, I(-8(rp),-8(rp,i,8))
	add	X0, %rax
	mov	%rax, I((rp),(rp,i,8))
	adc	Z(i,$0), %rdx
	mov	%rdx, I(8(rp),8(rp,i,8))

	lea	16(rp), rp
	add	$2, n
	cmp	$-1, n
	jnz	L(am2o1)
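
C Tail reached when a single cross product of the top two limbs remains;
C the pending high limb from the preceding wind-down is still in %rdx
C and is folded in.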

L(cor1):pop	n
	mov	%rdx, w3
	mov	-16(up), v0
	mov	-8(up), %rax
	mul	v0
	add	w3, %rax
	adc	$0, %rdx
	mov	%rax, -8(rp)
	mov	%rdx, (rp)
	jmp	L(sqr_diag_addlsh1)

	ALIGNx
L(m2top2):
L(m2e2):mul	v0
	add	%rax, w3
	mov	-8(up,i,8), %rax
	mov	w3, -8(rp,i,8)
	adc	%rdx, w0
	adc	$0, R32(w1)
	mul	v1
	add	%rax, w0
	adc	%rdx, w1
	mov	$0, R32(w2)
	mov	(up,i,8), %rax
	mul	v0
	add	%rax, w0
	mov	w0, (rp,i,8)
	adc	%rdx, w1
	mov	(up,i,8), %rax
	adc	$0, R32(w2)
	mul	v1
	add	%rax, w1
	adc	%rdx, w2
	mov	8(up,i,8), %rax
	mul	v0
	mov	$0, R32(w3)
	add	%rax, w1
	adc	%rdx, w2
	adc	$0, R32(w3)
	mov	8(up,i,8), %rax
	mul	v1
	add	%rax, w2
	mov	w1, 8(rp,i,8)
	adc	%rdx, w3
	mov	$0, R32(w0)
	mov	16(up,i,8), %rax
	mul	v0
	add	%rax, w2
	mov	16(up,i,8), %rax
	adc	%rdx, w3
	adc	$0, R32(w0)
	mul	v1
	mov	$0, R32(w1)
	add	%rax, w3
	mov	24(up,i,8), %rax
	mov	w2, 16(rp,i,8)
	adc	%rdx, w0
	add	$4, i
	js	L(m2top2)

	mul	v0
	add	%rax, w3
	mov	I(-8(up),-8(up,i,8)), %rax
	mov	w3, I(-8(rp),-8(rp,i,8))
	adc	%rdx, w0
	adc	R32(w1), R32(w1)
	mul	v1
	add	w0, %rax
	adc	w1, %rdx
	mov	%rax, I((rp),(rp,i,8))
	mov	%rdx, I(8(rp),8(rp,i,8))

	lea	16(rp), rp
	add	$2, n			C decrease |n|
	jmp	L(am2o0)

	ALIGNx
L(m2top0):
	mul	v0
	add	%rax, w3
	mov	-8(up,i,8), %rax
	mov	w3, -8(rp,i,8)
	adc	%rdx, w0
	adc	$0, R32(w1)
	mul	v1
	add	%rax, w0
	adc	%rdx, w1
	mov	$0, R32(w2)
	mov	(up,i,8), %rax
	mul	v0
	add	%rax, w0
	mov	w0, (rp,i,8)
	adc	%rdx, w1
	mov	(up,i,8), %rax
	adc	$0, R32(w2)
	mul	v1
	add	%rax, w1
	adc	%rdx, w2
L(m2e0):mov	8(up,i,8), %rax
	mul	v0
	mov	$0, R32(w3)
	add	%rax, w1
	adc	%rdx, w2
	adc	$0, R32(w3)
	mov	8(up,i,8), %rax
	mul	v1
	add	%rax, w2
	mov	w1, 8(rp,i,8)
	adc	%rdx, w3
	mov	$0, R32(w0)
	mov	16(up,i,8), %rax
	mul	v0
	add	%rax, w2
	mov	16(up,i,8), %rax
	adc	%rdx, w3
	adc	$0, R32(w0)
	mul	v1
	mov	$0, R32(w1)
	add	%rax, w3
	mov	24(up,i,8), %rax
	mov	w2, 16(rp,i,8)
	adc	%rdx, w0
	add	$4, i
	js	L(m2top0)

	mul	v0
	add	%rax, w3
	mov	I(-8(up),-8(up,i,8)), %rax
	mov	w3, I(-8(rp),-8(rp,i,8))
	adc	%rdx, w0
	adc	R32(w1), R32(w1)
	mul	v1
	add	w0, %rax
	adc	w1, %rdx
	mov	%rax, I((rp),(rp,i,8))
	mov	%rdx, I(8(rp),8(rp,i,8))

	lea	16(rp), rp
	add	$2, n			C decrease |n|
	cmp	$-2, n
	jz	L(cor2)			C jumps iff entry n = 5

L(am2o2):
	mov	-8(up,n,8), v0
	mov	(up,n,8), %rax
	mov	%rax, v1
	lea	-2(n), i
	mul	v0
	mov	%rax, X0
	MOV(	%rdx, X1, 32)
	mov	(rp,n,8), w0
	xor	R32(w1), R32(w1)
	xor	R32(w2), R32(w2)
	mov	8(up,n,8), %rax
	jmp	L(lo2)

	ALIGNx
L(am2top2):
	mul	v1
	add	w0, w1
	adc	%rax, w2
	mov	(up,i,8), %rax
	MOV(	%rdx, w3, 1)
	adc	$0, w3
	mul	v0
	add	w1, X1
	mov	X1, -8(rp,i,8)
	adc	%rax, X0
	MOV(	%rdx, X1, 2)
	adc	$0, X1
	mov	(up,i,8), %rax
	mul	v1
	MOV(	%rdx, w0, 4)
	mov	(rp,i,8), w1
	add	w1, w2
	adc	%rax, w3
	adc	$0, w0
	mov	8(up,i,8), %rax
	mul	v0
	add	w2, X0
	adc	%rax, X1
	mov	X0, (rp,i,8)
	MOV(	%rdx, X0, 8)
	adc	$0, X0
	mov	8(up,i,8), %rax
	mov	8(rp,i,8), w2
	mul	v1
	add	w2, w3
	adc	%rax, w0
	MOV(	%rdx, w1, 16)
	adc	$0, w1
	mov	16(up,i,8), %rax
	mul	v0
	add	w3, X1
	mov	X1, 8(rp,i,8)
	adc	%rax, X0
	MOV(	%rdx, X1, 32)
	mov	16(rp,i,8), w3
	adc	$0, X1
	mov	16(up,i,8), %rax
	mul	v1
	add	w3, w0
	MOV(	%rdx, w2, 64)
	adc	%rax, w1
	mov	24(up,i,8), %rax
	adc	$0, w2
L(lo2):	mul	v0
	add	w0, X0
	mov	X0, 16(rp,i,8)
	MOV(	%rdx, X0, 128)
	adc	%rax, X1
	mov	24(up,i,8), %rax
	mov	24(rp,i,8), w0
	adc	$0, X0
	add	$4, i
	jnc	L(am2top2)

	mul	v1
	add	w0, w1
	adc	w2, %rax
	adc	Z(i,$0), %rdx
	add	w1, X1
	adc	Z(i,$0), X0
	mov	X1, I(-8(rp),-8(rp,i,8))
	add	X0, %rax
	mov	%rax, I((rp),(rp,i,8))
	adc	Z(i,$0), %rdx
	mov	%rdx, I(8(rp),8(rp,i,8))

	lea	16(rp), rp
	add	$2, n

L(am2o0):
	mov	-8(up,n,8), v0
	mov	(up,n,8), %rax
	mov	%rax, v1
	lea	0(n), i
	mul	v0
	mov	%rax, X0
	MOV(	%rdx, X1, 2)
	xor	R32(w0), R32(w0)
	mov	(rp,n,8), w2
	xor	R32(w3), R32(w3)
	jmp	L(lo0)

	ALIGNx
L(am2top0):
	mul	v1
	add	w0, w1
	adc	%rax, w2
	mov	(up,i,8), %rax
	MOV(	%rdx, w3, 1)
	adc	$0, w3
	mul	v0
	add	w1, X1
	mov	X1, -8(rp,i,8)
	adc	%rax, X0
	MOV(	%rdx, X1, 2)
	adc	$0, X1
	mov	(up,i,8), %rax
	mul	v1
	MOV(	%rdx, w0, 4)
	mov	(rp,i,8), w1
	add	w1, w2
	adc	%rax, w3
	adc	$0, w0
L(lo0):	mov	8(up,i,8), %rax
	mul	v0
	add	w2, X0
	adc	%rax, X1
	mov	X0, (rp,i,8)
	MOV(	%rdx, X0, 8)
	adc	$0, X0
	mov	8(up,i,8), %rax
	mov	8(rp,i,8), w2
	mul	v1
	add	w2, w3
	adc	%rax, w0
	MOV(	%rdx, w1, 16)
	adc	$0, w1
	mov	16(up,i,8), %rax
	mul	v0
	add	w3, X1
	mov	X1, 8(rp,i,8)
	adc	%rax, X0
	MOV(	%rdx, X1, 32)
	mov	16(rp,i,8), w3
	adc	$0, X1
	mov	16(up,i,8), %rax
	mul	v1
	add	w3, w0
	MOV(	%rdx, w2, 64)
	adc	%rax, w1
	mov	24(up,i,8), %rax
	adc	$0, w2
	mul	v0
	add	w0, X0
	mov	X0, 16(rp,i,8)
	MOV(	%rdx, X0, 128)
	adc	%rax, X1
	mov	24(up,i,8), %rax
	mov	24(rp,i,8), w0
	adc	$0, X0
	add	$4, i
	jnc	L(am2top0)

	mul	v1
	add	w0, w1
	adc	w2, %rax
	adc	Z(i,$0), %rdx
	add	w1, X1
	adc	Z(i,$0), X0
	mov	X1, I(-8(rp),-8(rp,i,8))
	add	X0, %rax
	mov	%rax, I((rp),(rp,i,8))
	adc	Z(i,$0), %rdx
	mov	%rdx, I(8(rp),8(rp,i,8))

	lea	16(rp), rp
	add	$2, n
	cmp	$-2, n
	jnz	L(am2o2)
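
C Tail with three cross products of the top three limbs left; w2 and w0
C pick up the pending limbs from the preceding wind-down.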

L(cor2):pop	n
	mov	-24(up), v0
	mov	%rax, w2
	mov	%rdx, w0
	mov	-16(up), %rax
	mov	%rax, v1
	mul	v0
	mov	%rax, X0
	MOV(	%rdx, X1, 32)
	mov	-8(up), %rax
	mul	v0
	add	w2, X0
	mov	X0, -16(rp)
	MOV(	%rdx, X0, 128)
	adc	%rax, X1
	mov	-8(up), %rax
	adc	$0, X0
	mul	v1
	add	w0, X1
	adc	$0, X0
	mov	X1, -8(rp)
	add	X0, %rax
	mov	%rax, (rp)
	adc	$0, %rdx
	mov	%rdx, 8(rp)
	lea	8(rp), rp
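
C sqr_diag_addlsh1: add the diagonal squares into twice the off-diagonal
C triangle accumulated above.  The doubling is done on the fly by the
C adc chains below; %rbx saves the carry of each doubling step across
C the multiply.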

L(sqr_diag_addlsh1):
	mov	-8(up,n,8), %rax
	shl	n
	xor	R32(%rbx), R32(%rbx)
	mul	%rax
	mov	8(rp,n,8), %r11
	lea	(%rdx), %r10
	mov	16(rp,n,8), %r9
	add	%r11, %r11
	jmp	L(dm)

	ALIGNx
L(dtop):mul	%rax
	add	%r11, %r10
	mov	8(rp,n,8), %r11
	mov	%r10, -8(rp,n,8)
	adc	%r9, %rax
	lea	(%rdx,%rbx), %r10
	mov	16(rp,n,8), %r9
	adc	%r11, %r11
L(dm):	mov	%rax, (rp,n,8)
	mov	(up,n,4), %rax
	adc	%r9, %r9
	setc	R8(%rbx)
	add	$2, n
	js	L(dtop)

	mul	%rax
	add	%r11, %r10
	mov	%r10, -8(rp)
	adc	%r9, %rax
	lea	(%rdx,%rbx), %r10
	mov	%rax, (rp)
	adc	$0, %r10
	mov	%r10, 8(rp)

	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	FUNC_EXIT()
	ret
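
C Basecases for operands of 1, 2, and 3 limbs, each fully unrolled.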

	ALIGN(16)
L(small):
	mov	(up), %rax
	cmp	$2, n_param
	jae	L(gt1)
L(n1):
	mul	%rax
	mov	%rax, (rp)
	mov	%rdx, 8(rp)
	FUNC_EXIT()
	ret

L(gt1):	jne	L(gt2)
L(n2):	mov	%rax, %r8
	mul	%rax
	mov	8(up), %r11
	mov	%rax, (rp)
	mov	%r11, %rax
	mov	%rdx, %r9
	mul	%rax
	mov	%rax, %r10
	mov	%r11, %rax
	mov	%rdx, %r11
	mul	%r8
	xor	%r8, %r8
	add	%rax, %r9
	adc	%rdx, %r10
	adc	%r8, %r11
	add	%rax, %r9
	mov	%r9, 8(rp)
	adc	%rdx, %r10
	mov	%r10, 16(rp)
	adc	%r8, %r11
	mov	%r11, 24(rp)
	FUNC_EXIT()
	ret

L(gt2):
L(n3):	mov	%rax, %r10
	mul	%rax
	mov	8(up), %r11
	mov	%rax, (rp)
	mov	%r11, %rax
	mov	%rdx, 8(rp)
	mul	%rax
	mov	16(up), %rcx
	mov	%rax, 16(rp)
	mov	%rcx, %rax
	mov	%rdx, 24(rp)
	mul	%rax
	mov	%rax, 32(rp)
	mov	%rdx, 40(rp)

	mov	%r11, %rax
	mul	%r10
	mov	%rax, %r8
	mov	%rcx, %rax
	mov	%rdx, %r9
	mul	%r10
	xor	%r10, %r10
	add	%rax, %r9
	mov	%r11, %rax
	mov	%r10, %r11
	adc	%rdx, %r10

	mul	%rcx
	add	%rax, %r10
	adc	%r11, %rdx
	add	%r8, %r8
	adc	%r9, %r9
	adc	%r10, %r10
	adc	%rdx, %rdx
	adc	%r11, %r11
	add	%r8, 8(rp)
	adc	%r9, 16(rp)
	adc	%r10, 24(rp)
	adc	%rdx, 32(rp)
	adc	%r11, 40(rp)
	FUNC_EXIT()
	ret
EPILOGUE()