beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / x86_64 / k8 / sqr_basecase.asm
blob60cf945a4653a136cec12e77e48c13203f9bb56f
1 dnl AMD64 mpn_sqr_basecase.
3 dnl Contributed to the GNU project by Torbjorn Granlund.
5 dnl Copyright 2008, 2009, 2011, 2012 Free Software Foundation, Inc.
7 dnl This file is part of the GNU MP Library.
8 dnl
9 dnl The GNU MP Library is free software; you can redistribute it and/or modify
10 dnl it under the terms of either:
11 dnl
12 dnl * the GNU Lesser General Public License as published by the Free
13 dnl Software Foundation; either version 3 of the License, or (at your
14 dnl option) any later version.
15 dnl
16 dnl or
17 dnl
18 dnl * the GNU General Public License as published by the Free Software
19 dnl Foundation; either version 2 of the License, or (at your option) any
20 dnl later version.
21 dnl
22 dnl or both in parallel, as here.
23 dnl
24 dnl The GNU MP Library is distributed in the hope that it will be useful, but
25 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
26 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
27 dnl for more details.
28 dnl
29 dnl You should have received copies of the GNU General Public License and the
30 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
31 dnl see https://www.gnu.org/licenses/.
33 include(`../config.m4')
35 C The inner loops of this code are the result of running a code generation and
36 C optimization tool suite written by David Harvey and Torbjorn Granlund.
38 C NOTES
39 C * There is a major stupidity in that we call mpn_mul_1 initially, for a
40 C large trip count. Instead, we should follow the generic/sqr_basecase.c
41 C code which uses addmul_2s from the start, conditionally leaving a 1x1
42 C multiply to the end. (In assembly code, one would stop invoking
43 C addmul_2s loops when perhaps 3x2s respectively a 2x2s remains.)
44 C * Another stupidity is in the sqr_diag_addlsh1 code. It does not need to
45 C save/restore carry, instead it can propagate into the high product word.
46 C * Align more labels, should shave off a few cycles.
47 C * We can safely use 32-bit size operations, since operands with (2^32)
48 C limbs will lead to non-termination in practice.
49 C * The jump table could probably be optimized, at least for non-pic.
50 C * The special code for n <= 4 was quickly written. It is probably too
51 C large and unnecessarily slow.
52 C * Consider combining small cases code so that the n=k-1 code jumps into the
53 C middle of the n=k code.
54 C * Avoid saving registers for small cases code.
55 C * Needed variables:
56 C n r11 input size
57 C i r8 work left, initially n
58 C j r9 inner loop count
59 C r15 unused
60 C v0 r13
61 C v1 r14
62 C rp rdi
63 C up rsi
64 C w0 rbx
65 C w1 rcx
66 C w2 rbp
67 C w3 r10
68 C tp r12
69 C lo rax
70 C hi rdx
71 C rsp
73 C INPUT PARAMETERS
74 define(`rp', `%rdi')
75 define(`up', `%rsi')
76 define(`n_param', `%rdx')
C n is copied out of rdx early because `mul` clobbers rdx:rax.
78 define(`n', `%r11')
79 define(`tp', `%r12')
80 define(`i', `%r8')
81 define(`j', `%r9')
C v0/v1 hold the current multiplier limb pair in the mul_2/addmul_2 loops.
82 define(`v0', `%r13')
83 define(`v1', `%r14')
C w0..w3 are the rotating partial-product accumulation windows of the
C unrolled inner loops (see the variable table in the NOTES above).
84 define(`w0', `%rbx')
85 define(`w1', `%rcx')
86 define(`w2', `%rbp')
87 define(`w3', `%r10')
C Declare support for both the SysV (STD64) and Windows (DOS64) ABIs;
C FUNC_ENTRY/FUNC_EXIT expand to any needed argument-register shuffling.
89 ABI_SUPPORT(DOS64)
90 ABI_SUPPORT(STD64)
C -----------------------------------------------------------------------
C mpn_sqr_basecase(mp_ptr rp, mp_srcptr up, mp_size_t n)
C Writes the 2n-limb square of the n-limb operand {up,n} to {rp,2n}.
C Entry code: save callee-saved regs into a 40-byte stack area, then
C dispatch through an 8-entry jump table.  Slots 0..3 serve n <= 4
C (index = n mod 4, so n=4 -> slot 0 = L(4)); for n > 4 the cmovg
C replaces the index with 4 + (n mod 4), selecting L(0m4)..L(3m4).
C NOTE(review): this copy appears to have lost lines in extraction --
C the closing `',` / `')` of the PIC ifdef below is missing, and each
C FUNC_EXIT() in the pristine GMP source is followed by a `ret` that
C is absent here.  Verify against upstream before assembling.
C -----------------------------------------------------------------------
92 ASM_START()
93 TEXT
94 ALIGN(16)
95 PROLOGUE(mpn_sqr_basecase)
96 FUNC_ENTRY(3)
97 mov R32(n_param), R32(%rcx)
98 mov R32(n_param), R32(n) C free original n register (rdx)
C Reserve 40 bytes for five callee-saved registers (no push, so the
C small-n paths can discard the area with a single add $40).
100 add $-40, %rsp
102 and $3, R32(%rcx)
103 cmp $4, R32(n_param)
104 lea 4(%rcx), %r8
106 mov %rbx, 32(%rsp)
107 mov %rbp, 24(%rsp)
108 mov %r12, 16(%rsp)
109 mov %r13, 8(%rsp)
110 mov %r14, (%rsp)
C For n > 4 use table slots 4..7 (the m4 residue classes).
112 cmovg %r8, %rcx
114 lea L(tab)(%rip), %rax
C PIC: table holds 32-bit offsets relative to L(tab); non-PIC: 64-bit
C absolute entries, jumped through directly.
115 ifdef(`PIC',
116 ` movslq (%rax,%rcx,4), %r10
117 add %r10, %rax
118 jmp *%rax
120 jmp *(%rax,%rcx,8)
122 JUMPTABSECT
123 ALIGN(8)
124 L(tab): JMPENT( L(4), L(tab))
125 JMPENT( L(1), L(tab))
126 JMPENT( L(2), L(tab))
127 JMPENT( L(3), L(tab))
128 JMPENT( L(0m4), L(tab))
129 JMPENT( L(1m4), L(tab))
130 JMPENT( L(2m4), L(tab))
131 JMPENT( L(3m4), L(tab))
132 TEXT
C n = 1:  rp[1:0] = up[0]^2.  The 40-byte save area is simply dropped;
C no callee-saved register was modified on this path.
134 L(1): mov (up), %rax
135 mul %rax
136 add $40, %rsp
137 mov %rax, (rp)
138 mov %rdx, 8(rp)
C NOTE(review): pristine GMP follows FUNC_EXIT() with `ret`; the line
C appears to have been lost in extraction.
139 FUNC_EXIT()
C n = 2: compute the diagonal squares u0^2 (-> rp[1:0]) and u1^2
C (-> r11:r10), then add the single cross product u0*u1 in TWICE
C (equivalent to adding 2*u0*u1 without an explicit shift).
142 L(2): mov (up), %rax
143 mov %rax, %r8 C keep u0 for the cross product below
144 mul %rax
145 mov 8(up), %r11
146 mov %rax, (rp)
147 mov %r11, %rax
148 mov %rdx, %r9
149 mul %rax
150 add $40, %rsp
151 mov %rax, %r10
152 mov %r11, %rax
153 mov %rdx, %r11
154 mul %r8 C rdx:rax = u0 * u1
155 xor %r8, %r8
C first addition of the cross product
156 add %rax, %r9
157 adc %rdx, %r10
158 adc %r8, %r11
C second addition, interleaved with the stores of rp[1..3]
159 add %rax, %r9
160 mov %r9, 8(rp)
161 adc %rdx, %r10
162 mov %r10, 16(rp)
163 adc %r8, %r11
164 mov %r11, 24(rp)
165 FUNC_EXIT()
C n = 3: store the three diagonal squares u0^2, u1^2, u2^2 straight
C into rp[0..5]; accumulate the cross products u0u1, u0u2, u1u2 into
C r8:r9:r10:rdx (r11 = top carry), double that sum with an adc chain,
C and add it into rp[1..5].
168 L(3): mov (up), %rax
169 mov %rax, %r10 C keep u0
170 mul %rax
171 mov 8(up), %r11
172 mov %rax, (rp)
173 mov %r11, %rax
174 mov %rdx, 8(rp)
175 mul %rax
176 mov 16(up), %rcx
177 mov %rax, 16(rp)
178 mov %rcx, %rax
179 mov %rdx, 24(rp)
180 mul %rax
181 mov %rax, 32(rp)
182 mov %rdx, 40(rp)
C cross products: u1*u0, u2*u0, u1*u2
184 mov %r11, %rax
185 mul %r10
186 mov %rax, %r8
187 mov %rcx, %rax
188 mov %rdx, %r9
189 mul %r10
190 xor %r10, %r10
191 add %rax, %r9
192 mov %r11, %rax
193 mov %r10, %r11 C r11 = 0, becomes the top carry limb
194 adc %rdx, %r10
196 mul %rcx
197 add $40, %rsp
198 add %rax, %r10
199 adc %r11, %rdx
C double the whole cross-product sum (left shift by 1 via adc chain)
200 add %r8, %r8
201 adc %r9, %r9
202 adc %r10, %r10
203 adc %rdx, %rdx
204 adc %r11, %r11
C add 2*(cross products) into the diagonal squares already in rp
205 add %r8, 8(rp)
206 adc %r9, 16(rp)
207 adc %r10, 24(rp)
208 adc %rdx, 32(rp)
209 adc %r11, 40(rp)
210 FUNC_EXIT()
C n = 4: same scheme as L(3) with four diagonal squares (stored to
C rp[0..7]) and six cross products accumulated in r8..r11,rcx,rdx,
C then doubled and added in.  Needs rbx as scratch: note the stack
C fixup is only $32, leaving the saved rbx in place for the later
C `pop %rbx` to restore it.
213 L(4): mov (up), %rax
214 mov %rax, %r11 C keep u0
215 mul %rax
216 mov 8(up), %rbx C keep u1
217 mov %rax, (rp)
218 mov %rbx, %rax
219 mov %rdx, 8(rp)
220 mul %rax
221 mov %rax, 16(rp)
222 mov %rdx, 24(rp)
223 mov 16(up), %rax
224 mul %rax
225 mov %rax, 32(rp)
226 mov %rdx, 40(rp)
227 mov 24(up), %rax
228 mul %rax
229 mov %rax, 48(rp)
230 mov %rbx, %rax
231 mov %rdx, 56(rp)
C cross products: u1u0, u2u0, u3u0, u2u1, u3u1, u3u2
233 mul %r11
234 add $32, %rsp C leave saved rbx on the stack (popped below)
235 mov %rax, %r8
236 mov %rdx, %r9
237 mov 16(up), %rax
238 mul %r11
239 xor %r10, %r10
240 add %rax, %r9
241 adc %rdx, %r10
242 mov 24(up), %rax
243 mul %r11
244 xor %r11, %r11
245 add %rax, %r10
246 adc %rdx, %r11
247 mov 16(up), %rax
248 mul %rbx
249 xor %rcx, %rcx
250 add %rax, %r10
251 adc %rdx, %r11
252 adc $0, %rcx
253 mov 24(up), %rax
254 mul %rbx
255 pop %rbx C restore callee-saved rbx
256 add %rax, %r11
257 adc %rdx, %rcx
258 mov 16(up), %rdx
259 mov 24(up), %rax
260 mul %rdx
261 add %rax, %rcx
262 adc $0, %rdx
C double the cross-product sum, capturing the final carry in rax
264 add %r8, %r8
265 adc %r9, %r9
266 adc %r10, %r10
267 adc %r11, %r11
268 adc %rcx, %rcx
269 mov $0, R32(%rax) C mov (not xor) to preserve the carry flag
270 adc %rdx, %rdx
272 adc %rax, %rax
C add 2*(cross products) into the diagonal squares in rp[1..7]
273 add %r8, 8(rp)
274 adc %r9, 16(rp)
275 adc %r10, 24(rp)
276 adc %r11, 32(rp)
277 adc %rcx, 40(rp)
278 adc %rdx, 48(rp)
279 adc %rax, 56(rp)
280 FUNC_EXIT()
C n >= 8, n == 0 (mod 4).  First pass: tp[] = {up+1, n-1} * up[0], an
C inlined mpn_mul_1 with a 4-way unrolled loop.  j runs from -n up
C toward 0 in steps of 4; the loop is entered at L(L3) to absorb the
C trip count's residue.  Falls through to the addmul_2 outer loop at
C L(dowhile).
284 L(0m4):
285 lea -16(rp,n,8), tp C point tp in middle of result operand
286 mov (up), v0
287 mov 8(up), %rax
288 lea (up,n,8), up C point up at end of input operand
290 lea -4(n), i
291 C Function mpn_mul_1_m3(tp, up - i, i, up[-i - 1])
292 xor R32(j), R32(j)
293 sub n, j C j = -n
295 mul v0
296 xor R32(w2), R32(w2)
297 mov %rax, w0
298 mov 16(up,j,8), %rax
299 mov %rdx, w3
300 jmp L(L3)
302 ALIGN(16)
C 4-way unrolled mul_1 core: w0..w3 rotate as product low/high words;
C carry propagation is through the adc into the next window.
303 L(mul_1_m3_top):
304 add %rax, w2
305 mov w3, (tp,j,8)
306 mov (up,j,8), %rax
307 adc %rdx, w1
308 xor R32(w0), R32(w0)
309 mul v0
310 xor R32(w3), R32(w3)
311 mov w2, 8(tp,j,8)
312 add %rax, w1
313 adc %rdx, w0
314 mov 8(up,j,8), %rax
315 mov w1, 16(tp,j,8)
316 xor R32(w2), R32(w2)
317 mul v0
318 add %rax, w0
319 mov 16(up,j,8), %rax
320 adc %rdx, w3
321 L(L3): xor R32(w1), R32(w1)
322 mul v0
323 add %rax, w3
324 mov 24(up,j,8), %rax
325 adc %rdx, w2
326 mov w0, 24(tp,j,8)
327 mul v0
328 add $4, j
329 js L(mul_1_m3_top)
C loop epilogue: flush the last three window words
331 add %rax, w2
332 mov w3, (tp)
333 adc %rdx, w1
334 mov w2, 8(tp)
335 mov w1, 16(tp)
337 lea eval(2*8)(tp), tp C tp += 2
338 lea -8(up), up
339 jmp L(dowhile)
C n >= 5, n == 1 (mod 4).  First pass is an inlined mul_2: multiply
C the rest of the operand by the limb pair (v0,v1) = (u0,u1) at once,
C storing u0*u1's low limb to rp[1] up front.  Loop entered at L(m0)
C for this residue class; continues at L(dowhile_end).
342 L(1m4):
343 lea 8(rp,n,8), tp C point tp in middle of result operand
344 mov (up), v0 C u0
345 mov 8(up), %rax C u1
346 lea 8(up,n,8), up C point up at end of input operand
348 lea -3(n), i
349 C Function mpn_mul_2s_m0(tp, up - i, i, up - i - 1)
350 lea -3(n), j
351 neg j
353 mov %rax, v1 C u1
354 mul v0 C u0 * u1
355 mov %rdx, w1
356 xor R32(w2), R32(w2)
357 mov %rax, 8(rp)
358 jmp L(m0)
360 ALIGN(16)
C 4-way unrolled mul_2 core: each source limb is multiplied by both
C v0 and v1; w0..w3 rotate as the accumulation windows.
361 L(mul_2_m0_top):
362 mul v1
363 add %rax, w0
364 adc %rdx, w1
365 mov -24(up,j,8), %rax
366 mov $0, R32(w2)
367 mul v0
368 add %rax, w0
369 mov -24(up,j,8), %rax
370 adc %rdx, w1
371 adc $0, R32(w2)
372 mul v1 C v1 * u0
373 add %rax, w1
374 mov w0, -24(tp,j,8)
375 adc %rdx, w2
376 L(m0): mov -16(up,j,8), %rax C u2, u6 ...
377 mul v0 C u0 * u2
378 mov $0, R32(w3)
379 add %rax, w1
380 adc %rdx, w2
381 mov -16(up,j,8), %rax
382 adc $0, R32(w3)
383 mov $0, R32(w0)
384 mov w1, -16(tp,j,8)
385 mul v1
386 add %rax, w2
387 mov -8(up,j,8), %rax
388 adc %rdx, w3
389 mov $0, R32(w1)
390 mul v0
391 add %rax, w2
392 mov -8(up,j,8), %rax
393 adc %rdx, w3
394 adc $0, R32(w0)
395 mul v1
396 add %rax, w3
397 mov w2, -8(tp,j,8)
398 adc %rdx, w0
399 L(m2x): mov (up,j,8), %rax
400 mul v0
401 add %rax, w3
402 adc %rdx, w0
403 adc $0, R32(w1)
404 add $4, j
405 mov -32(up,j,8), %rax
406 mov w3, -32(tp,j,8)
407 js L(mul_2_m0_top)
C loop epilogue: final v1 product and flush of the last two words
409 mul v1
410 add %rax, w0
411 adc %rdx, w1
412 mov w0, -8(tp)
413 mov w1, (tp)
415 lea -16(up), up
416 lea eval(3*8-24)(tp), tp C tp += 3
417 jmp L(dowhile_end)
C n >= 6, n == 2 (mod 4).  First pass: inlined mpn_mul_1 of the
C remaining limbs by up[0], same rotating-window loop as L(0m4) but
C entered at L(L1) for this residue class.  Continues at
C L(dowhile_mid).
420 L(2m4):
421 lea -16(rp,n,8), tp C point tp in middle of result operand
422 mov (up), v0
423 mov 8(up), %rax
424 lea (up,n,8), up C point up at end of input operand
426 lea -4(n), i
427 C Function mpn_mul_1_m1(tp, up - (i - 1), i - 1, up[-i])
428 lea -2(n), j
429 neg j
431 mul v0
432 mov %rax, w2
433 mov (up,j,8), %rax
434 mov %rdx, w1
435 jmp L(L1)
437 ALIGN(16)
438 L(mul_1_m1_top):
439 add %rax, w2
440 mov w3, (tp,j,8)
441 mov (up,j,8), %rax
442 adc %rdx, w1
443 L(L1): xor R32(w0), R32(w0)
444 mul v0
445 xor R32(w3), R32(w3)
446 mov w2, 8(tp,j,8)
447 add %rax, w1
448 adc %rdx, w0
449 mov 8(up,j,8), %rax
450 mov w1, 16(tp,j,8)
451 xor R32(w2), R32(w2)
452 mul v0
453 add %rax, w0
454 mov 16(up,j,8), %rax
455 adc %rdx, w3
456 xor R32(w1), R32(w1)
457 mul v0
458 add %rax, w3
459 mov 24(up,j,8), %rax
460 adc %rdx, w2
461 mov w0, 24(tp,j,8)
462 mul v0
463 add $4, j
464 js L(mul_1_m1_top)
C loop epilogue: flush the last three window words
466 add %rax, w2
467 mov w3, (tp)
468 adc %rdx, w1
469 mov w2, 8(tp)
470 mov w1, 16(tp)
472 lea eval(2*8)(tp), tp C tp += 2
473 lea -8(up), up
474 jmp L(dowhile_mid)
C n >= 7, n == 3 (mod 4).  First pass is an inlined mul_2 (same core
C as L(1m4)'s loop) entered at L(m2) for this residue class.
C Continues at L(dowhile_mid).
477 L(3m4):
478 lea 8(rp,n,8), tp C point tp in middle of result operand
479 mov (up), v0 C u0
480 mov 8(up), %rax C u1
481 lea 8(up,n,8), up C point up at end of input operand
483 lea -5(n), i
484 C Function mpn_mul_2s_m2(tp, up - i + 1, i - 1, up - i)
485 lea -1(n), j
486 neg j
488 mov %rax, v1 C u1
489 mul v0 C u0 * u1
490 mov %rdx, w3
491 xor R32(w0), R32(w0)
492 xor R32(w1), R32(w1)
493 mov %rax, 8(rp)
494 jmp L(m2)
496 ALIGN(16)
C 4-way unrolled mul_2 core (duplicate of L(mul_2_m0_top)'s body with
C its own entry point L(m2) below)
497 L(mul_2_m2_top):
498 mul v1
499 add %rax, w0
500 adc %rdx, w1
501 mov -24(up,j,8), %rax
502 mov $0, R32(w2)
503 mul v0
504 add %rax, w0
505 mov -24(up,j,8), %rax
506 adc %rdx, w1
507 adc $0, R32(w2)
508 mul v1 C v1 * u0
509 add %rax, w1
510 mov w0, -24(tp,j,8)
511 adc %rdx, w2
512 mov -16(up,j,8), %rax
513 mul v0
514 mov $0, R32(w3)
515 add %rax, w1
516 adc %rdx, w2
517 mov -16(up,j,8), %rax
518 adc $0, R32(w3)
519 mov $0, R32(w0)
520 mov w1, -16(tp,j,8)
521 mul v1
522 add %rax, w2
523 mov -8(up,j,8), %rax
524 adc %rdx, w3
525 mov $0, R32(w1)
526 mul v0
527 add %rax, w2
528 mov -8(up,j,8), %rax
529 adc %rdx, w3
530 adc $0, R32(w0)
531 mul v1
532 add %rax, w3
533 mov w2, -8(tp,j,8)
534 adc %rdx, w0
535 L(m2): mov (up,j,8), %rax
536 mul v0
537 add %rax, w3
538 adc %rdx, w0
539 adc $0, R32(w1)
540 add $4, j
541 mov -32(up,j,8), %rax
542 mov w3, -32(tp,j,8)
543 js L(mul_2_m2_top)
C loop epilogue: final v1 product and flush of the last two words
545 mul v1
546 add %rax, w0
547 adc %rdx, w1
548 mov w0, -8(tp)
549 mov w1, (tp)
551 lea -16(up), up
552 jmp L(dowhile_mid)
C Outer loop, addmul_2 flavor "m2": take the next multiplier limb pair
C (v0,v1) from just below the already-processed region and ADD the
C double products into the triangle already accumulated in tp[].
C i counts remaining work; each outer pass consumes two limbs.
554 L(dowhile):
555 C Function mpn_addmul_2s_m2(tp, up - (i - 1), i - 1, up - i)
556 lea 4(i), j
557 neg j
559 mov 16(up,j,8), v0
560 mov 24(up,j,8), v1
561 mov 24(up,j,8), %rax
562 mul v0
563 xor R32(w3), R32(w3)
C seed: add v0*v1's low limb directly into memory, keep the high limb
564 add %rax, 24(tp,j,8)
565 adc %rdx, w3
566 xor R32(w0), R32(w0)
567 xor R32(w1), R32(w1)
568 jmp L(am2)
570 ALIGN(16)
C 4-way unrolled addmul_2 core: like the mul_2 loops but each window
C word is added into the existing tp[] limb instead of stored.
571 L(addmul_2_m2_top):
572 add w3, (tp,j,8)
573 adc %rax, w0
574 mov 8(up,j,8), %rax
575 adc %rdx, w1
576 mov $0, R32(w2)
577 mul v0
578 add %rax, w0
579 mov 8(up,j,8), %rax
580 adc %rdx, w1
581 adc $0, R32(w2)
582 mul v1 C v1 * u0
583 add w0, 8(tp,j,8)
584 adc %rax, w1
585 adc %rdx, w2
586 mov 16(up,j,8), %rax
587 mov $0, R32(w3)
588 mul v0 C v0 * u1
589 add %rax, w1
590 mov 16(up,j,8), %rax
591 adc %rdx, w2
592 adc $0, R32(w3)
593 mul v1 C v1 * u1
594 add w1, 16(tp,j,8)
595 adc %rax, w2
596 mov 24(up,j,8), %rax
597 adc %rdx, w3
598 mul v0
599 mov $0, R32(w0)
600 add %rax, w2
601 adc %rdx, w3
602 mov $0, R32(w1)
603 mov 24(up,j,8), %rax
604 adc $0, R32(w0)
605 mul v1
606 add w2, 24(tp,j,8)
607 adc %rax, w3
608 adc %rdx, w0
609 L(am2): mov 32(up,j,8), %rax
610 mul v0
611 add %rax, w3
612 mov 32(up,j,8), %rax
613 adc %rdx, w0
614 adc $0, R32(w1)
615 mul v1
616 add $4, j
617 js L(addmul_2_m2_top)
C loop epilogue: fold in the last products and flush the windows
619 add w3, (tp)
620 adc %rax, w0
621 adc %rdx, w1
622 mov w0, 8(tp)
623 mov w1, 16(tp)
625 lea eval(2*8)(tp), tp C tp += 2
627 add $-2, R32(i) C i -= 2
C Outer loop, addmul_2 flavor "m0": same scheme as L(dowhile) but with
C the loop entry at L(20), serving the other residue of the remaining
C trip count.  Falls through to L(dowhile_end).
629 L(dowhile_mid):
630 C Function mpn_addmul_2s_m0(tp, up - (i - 1), i - 1, up - i)
631 lea 2(i), j
632 neg j
634 mov (up,j,8), v0
635 mov 8(up,j,8), v1
636 mov 8(up,j,8), %rax
637 mul v0
638 xor R32(w1), R32(w1)
C seed: add v0*v1's low limb directly into memory, keep the high limb
639 add %rax, 8(tp,j,8)
640 adc %rdx, w1
641 xor R32(w2), R32(w2)
642 jmp L(20)
644 ALIGN(16)
C 4-way unrolled addmul_2 core (duplicate of L(addmul_2_m2_top)'s body
C with entry point L(20))
645 L(addmul_2_m0_top):
646 add w3, (tp,j,8)
647 adc %rax, w0
648 mov 8(up,j,8), %rax
649 adc %rdx, w1
650 mov $0, R32(w2)
651 mul v0
652 add %rax, w0
653 mov 8(up,j,8), %rax
654 adc %rdx, w1
655 adc $0, R32(w2)
656 mul v1 C v1 * u0
657 add w0, 8(tp,j,8)
658 adc %rax, w1
659 adc %rdx, w2
660 L(20): mov 16(up,j,8), %rax
661 mov $0, R32(w3)
662 mul v0 C v0 * u1
663 add %rax, w1
664 mov 16(up,j,8), %rax
665 adc %rdx, w2
666 adc $0, R32(w3)
667 mul v1 C v1 * u1
668 add w1, 16(tp,j,8)
669 adc %rax, w2
670 mov 24(up,j,8), %rax
671 adc %rdx, w3
672 mul v0
673 mov $0, R32(w0)
674 add %rax, w2
675 adc %rdx, w3
676 mov $0, R32(w1)
677 mov 24(up,j,8), %rax
678 adc $0, R32(w0)
679 mul v1
680 add w2, 24(tp,j,8)
681 adc %rax, w3
682 adc %rdx, w0
683 mov 32(up,j,8), %rax
684 mul v0
685 add %rax, w3
686 mov 32(up,j,8), %rax
687 adc %rdx, w0
688 adc $0, R32(w1)
689 mul v1
690 add $4, j
691 js L(addmul_2_m0_top)
C loop epilogue: fold in the last products and flush the windows
693 add w3, (tp)
694 adc %rax, w0
695 adc %rdx, w1
696 mov w0, 8(tp)
697 mov w1, 16(tp)
699 lea eval(2*8)(tp), tp C tp += 2
C Outer-loop control: keep iterating the addmul_2 passes until only
C two limb pairs remain, then do one final straight-line 2x2 addmul
C (mpn_addmul_2s_2) to finish the cross-product triangle in tp[].
700 L(dowhile_end):
702 add $-2, R32(i) C i -= 2
703 jne L(dowhile)
705 C Function mpn_addmul_2s_2
706 mov -16(up), v0
707 mov -8(up), v1
708 mov -8(up), %rax
709 mul v0
710 xor R32(w3), R32(w3)
C add v0*v1's low limb into memory, keep the high limb in w3
711 add %rax, -8(tp)
712 adc %rdx, w3
713 xor R32(w0), R32(w0)
714 xor R32(w1), R32(w1)
715 mov (up), %rax
716 mul v0
717 add %rax, w3
718 mov (up), %rax
719 adc %rdx, w0
720 mul v1
721 add w3, (tp)
722 adc %rax, w0
723 adc %rdx, w1
724 mov w0, 8(tp)
725 mov w1, 16(tp)
C Final pass (mpn_sqr_diag_addlsh1): walk the result, doubling the
C accumulated cross-product limbs (addlsh1) and adding in the diagonal
C squares up[k]^2.  The carry of the doubling chain cannot live across
C the `mul`, so it is parked in rbx/rbp via sbb (reg = -CF) and
C restored with `add reg,reg`, alternating between the two registers
C each half-iteration (this is the save/restore the NOTES call
C avoidable).  j counts in double-limb units, hence the scale-4
C addressing of up: (up,j,4) advances one 8-byte limb per j += 2.
C TODO(review): scale-4 reading derived from j = 2n-4; confirm against
C upstream commentary.
727 C Function mpn_sqr_diag_addlsh1
728 lea -4(n,n), j C j = 2n - 4
730 mov 8(rp), %r11
731 lea -8(up), up
732 lea (rp,j,8), rp
733 neg j
734 mov (up,j,4), %rax
735 mul %rax
C bit 1 of j reflects n's parity: pick the matching loop entry
736 test $2, R8(j)
737 jnz L(odd)
739 L(evn): add %r11, %r11
740 sbb R32(%rbx), R32(%rbx) C save CF
741 add %rdx, %r11
742 mov %rax, (rp,j,8)
743 jmp L(d0)
745 L(odd): add %r11, %r11
746 sbb R32(%rbp), R32(%rbp) C save CF
747 add %rdx, %r11
748 mov %rax, (rp,j,8)
749 lea -2(j), j
750 jmp L(d1)
752 ALIGN(16)
C Each iteration handles two result limb pairs: square a limb, add it
C plus the restored carry into the doubled limbs, store, then double
C the next pair and park the new carry.
753 L(top): mov (up,j,4), %rax
754 mul %rax
755 add R32(%rbp), R32(%rbp) C restore carry
756 adc %rax, %r10
757 adc %rdx, %r11
758 mov %r10, (rp,j,8)
759 L(d0): mov %r11, 8(rp,j,8)
760 mov 16(rp,j,8), %r10
761 adc %r10, %r10
762 mov 24(rp,j,8), %r11
763 adc %r11, %r11
765 sbb R32(%rbp), R32(%rbp) C save CF
766 mov 8(up,j,4), %rax
767 mul %rax
768 add R32(%rbx), R32(%rbx) C restore carry
769 adc %rax, %r10
770 adc %rdx, %r11
771 mov %r10, 16(rp,j,8)
772 L(d1): mov %r11, 24(rp,j,8)
773 mov 32(rp,j,8), %r10
774 adc %r10, %r10
775 mov 40(rp,j,8), %r11
776 adc %r11, %r11
777 sbb R32(%rbx), R32(%rbx) C save CF
778 add $4, j
779 js L(top)
C Tail: the last two squares; the final carry is propagated into the
C top product word via rbp (negated sbb result) added to rdx.
781 mov (up), %rax
782 mul %rax
783 add R32(%rbp), R32(%rbp) C restore carry
784 adc %rax, %r10
785 adc %rdx, %r11
786 mov %r10, (rp)
787 mov %r11, 8(rp)
788 mov 16(rp), %r10
789 adc %r10, %r10
790 sbb R32(%rbp), R32(%rbp) C save CF
791 neg R32(%rbp) C rbp = CF (0 or 1)
792 mov 8(up), %rax
793 mul %rax
794 add R32(%rbx), R32(%rbx) C restore carry
795 adc %rax, %r10
796 adc %rbp, %rdx
797 mov %r10, 16(rp)
798 mov %rdx, 24(rp)
C Restore the five callee-saved registers from the 40-byte save area
C (pops mirror the mov-saves in the prologue) and return.
C NOTE(review): pristine GMP follows FUNC_EXIT() with `ret`; the line
C appears to have been lost in extraction.
800 pop %r14
801 pop %r13
802 pop %r12
803 pop %rbp
804 pop %rbx
805 FUNC_EXIT()
807 EPILOGUE()