beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / x86_64 / mulx / adx / addmul_1.asm
blobea607899a4cae65f1322e5a90fe4c8aeb2e9d8ad
1 dnl AMD64 mpn_addmul_1 for CPUs with mulx and adx.
3 dnl Contributed to the GNU project by Torbjörn Granlund.
5 dnl Copyright 2012, 2013 Free Software Foundation, Inc.
7 dnl This file is part of the GNU MP Library.
8 dnl
9 dnl The GNU MP Library is free software; you can redistribute it and/or modify
10 dnl it under the terms of either:
11 dnl
12 dnl * the GNU Lesser General Public License as published by the Free
13 dnl Software Foundation; either version 3 of the License, or (at your
14 dnl option) any later version.
15 dnl
16 dnl or
17 dnl
18 dnl * the GNU General Public License as published by the Free Software
19 dnl Foundation; either version 2 of the License, or (at your option) any
20 dnl later version.
21 dnl
22 dnl or both in parallel, as here.
23 dnl
24 dnl The GNU MP Library is distributed in the hope that it will be useful, but
25 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
26 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
27 dnl for more details.
28 dnl
29 dnl You should have received copies of the GNU General Public License and the
30 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
31 dnl see https://www.gnu.org/licenses/.
33 include(`../config.m4')
35 C cycles/limb
36 C AMD K8,K9 -
37 C AMD K10 -
38 C AMD bd1 -
39 C AMD bobcat -
40 C Intel P4 -
41 C Intel PNR -
42 C Intel NHM -
43 C Intel SBR -
44 C Intel HWL -
45 C Intel BWL ?
46 C Intel atom -
47 C VIA nano -
C Register aliases.
C
C rp, up, n_param and v0_param name the registers holding the four incoming
C arguments.  The function body swaps %rcx and %rdx with an xchg, after which
C the count is referred to as n (%rcx) and the multiplier as v0 (%rdx).
C NOTE(review): the trailing dnl notes look like the matching Win64 argument
C registers -- confirm before relying on them.
define(`rp',      `%rdi')	dnl rcx
define(`up',      `%rsi')	dnl rdx
define(`n_param', `%rdx')	dnl r8
define(`v0_param',`%rcx')	dnl r9

define(`n',       `%rcx')	dnl
define(`v0',      `%rdx')	dnl
C Testing mechanism for running this on older AMD64 processors.
C
C When FAKE_MULXADX is 1 the file missing-call.m4 is included; presumably it
C supplies substitute definitions of adox, adcx and mulx (TODO confirm against
C that file).  Otherwise the three macros below simply expand to the native
C instruction with the operands passed through unchanged.
ifelse(FAKE_MULXADX,1,`
  include(CONFIG_TOP_SRCDIR`/mpn/x86_64/missing-call.m4')
',`
  define(`adox',	``adox'	$1, $2')
  define(`adcx',	``adcx'	$1, $2')
  define(`mulx',	``mulx'	$1, $2, $3')
')
C mpn_addmul_1 -- multiply the n_param limbs at up by v0_param and add the
C products into the limbs at rp.
C
C On entry (register names come from the define lines earlier in this file):
C   rp        %rdi   destination limbs, read and written
C   up        %rsi   source limbs, read only
C   n_param   %rdx   limb count; a count of 0 is not handled (it would take
C                    the L(b0) path, which processes limbs unconditionally)
C   v0_param  %rcx   multiplier limb
C On exit:
C   %rax             high limb of the last product plus the final CF and OF
C                    carries
C
C %rbx, %r12 and %r13 are saved and restored.  %rcx, %rdx, %rsi, %rdi,
C %r8-%r11 and the flags are clobbered.
C
C Method: mulx takes its multiplicand from %rdx and leaves the flags alone, so
C two carry chains run side by side.  adcx (CF only) adds the high limb of
C one product to the low limb of the next; adox (OF only) adds in the limb
C read from rp.  The loop handles four limbs per iteration with the product
C pairs low:high in %r9:%r8, %r11:%r10, %r13:%r12 and %rbx:%rax; the count
C modulo 4 selects the entry point L(lo0) .. L(lo3).
C
C NOTE(review): no instruction between the first adcx/adox and the last one
C may write CF or OF; lea, mov, jrcxz, jmp and mulx are used for that reason.

ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(mpn_addmul_1)
	mov	(up), %r8		C loads the first source limb; %r8 is rewritten by mulx
					C before any read (NOTE(review): looks like an early
					C cache touch -- confirm)

	push	%rbx			C callee-saved registers used for product limbs
	push	%r12
	push	%r13

	lea	(up,n_param,8), up	C up = just past the last source limb
	lea	-16(rp,n_param,8), rp	C rp = two limbs before the end of the destination
	mov	R32(n_param), R32(%rax)	C copy of the count for the modulo 4 dispatch
	xchg	v0_param, v0		C FIXME: is this insn fast?
					C now %rdx = multiplier, %rcx = count

	neg	n			C n = -count: negative index that counts up to 0

	and	$3, R8(%rax)		C count modulo 4; this also clears CF and OF
	jz	L(b0)
	cmp	$2, R8(%rax)
	jl	L(b1)
	jz	L(b2)

C count modulo 4 = 3
L(b3):	mulx(	(up,n,8), %r11, %r10)
	mulx(	8(up,n,8), %r13, %r12)
	mulx(	16(up,n,8), %rbx, %rax)
	dec	n			C bias n by one so the offsets at L(lo3) address the
					C first limb; dec does not modify CF
	jmp	L(lo3)

C count modulo 4 = 0
L(b0):	mulx(	(up,n,8), %r9, %r8)
	mulx(	8(up,n,8), %r11, %r10)
	mulx(	16(up,n,8), %r13, %r12)
	jmp	L(lo0)

C count modulo 4 = 2
L(b2):	mulx(	(up,n,8), %r13, %r12)
	mulx(	8(up,n,8), %rbx, %rax)
	lea	2(n), n
	jrcxz	L(wd2)			C exactly two limbs: go straight to the wind-down
L(gt2):	mulx(	(up,n,8), %r9, %r8)
	jmp	L(lo2)

C count modulo 4 = 1
L(b1):	and	R8(%rax), R8(%rax)	C the cmp above borrowed (1 - 2) and set CF; clear
					C CF and OF again before the carry chains start
	mulx(	(up,n,8), %rbx, %rax)
	lea	1(n), n
	jrcxz	L(wd1)			C exactly one limb: go straight to the wind-down
	mulx(	(up,n,8), %r9, %r8)
	mulx(	8(up,n,8), %r11, %r10)
	jmp	L(lo1)

C Wind-down: n is 0 here, so the remaining limbs are addressed from rp alone.
L(end):	adcx(	%r10, %r13)
	mov	%r11, -8(rp)
L(wd2):	adox(	(rp), %r13)
	adcx(	%r12, %rbx)
	mov	%r13, (rp)
L(wd1):	adox(	8(rp), %rbx)
	adcx(	%rcx, %rax)		C %rcx is 0: fold the final CF into the return limb
	adox(	%rcx, %rax)		C and likewise the final OF
	mov	%rbx, 8(rp)
	pop	%r13
	pop	%r12
	pop	%rbx
	ret				C without this return, execution would run on into
					C L(top) with the stack already unwound

C Main loop, four limbs per iteration.
L(top):	jrcxz	L(end)			C n reached 0: finish in the wind-down
	mulx(	(up,n,8), %r9, %r8)
	adcx(	%r10, %r13)
	mov	%r11, -8(rp,n,8)
L(lo2):	adox(	(rp,n,8), %r13)
	mulx(	8(up,n,8), %r11, %r10)
	adcx(	%r12, %rbx)
	mov	%r13, (rp,n,8)
L(lo1):	adox(	8(rp,n,8), %rbx)
	mulx(	16(up,n,8), %r13, %r12)
	adcx(	%rax, %r9)
	mov	%rbx, 8(rp,n,8)
L(lo0):	adox(	16(rp,n,8), %r9)
	mulx(	24(up,n,8), %rbx, %rax)
	adcx(	%r8, %r11)
	mov	%r9, 16(rp,n,8)
L(lo3):	adox(	24(rp,n,8), %r11)
	lea	4(n), n			C lea advances the index without touching CF or OF
	jmp	L(top)
EPILOGUE()
ASM_END()