dnl  Retrieved from luatex.git (beta-0.89.2):
dnl    source/libs/gmp/gmp-src/mpn/x86_64/coreibwl/addmul_1.asm
dnl    blob aaa58e725189cb787952f8c846701a68a6cf92a9
dnl  AMD64 mpn_addmul_1 optimised for Intel Broadwell.

dnl  Copyright 2015 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.
include(`../config.m4')

C	     cycles/limb
C AMD K8,K9	n/a
C AMD K10	n/a
C AMD bull	n/a
C AMD pile	n/a
C AMD steam	n/a
C AMD excavator	 ?
C AMD bobcat	n/a
C AMD jaguar	n/a
C Intel P4	n/a
C Intel core2	n/a
C Intel NHM	n/a
C Intel SBR	n/a
C Intel IBR	n/a
C Intel HWL	n/a
C Intel BWL	1.8-1.9
C Intel atom	n/a
C Intel SLM	n/a
C VIA nano	n/a

C The loop of this code is the result of running a code generation and
C optimisation tool suite written by David Harvey and Torbjorn Granlund.

C TODO
C  * Put an initial mulx before switching, targeting some free registers.
C  * Tune feed-in code.
C  * Trim nop execution after L(f2).
C  * Port to DOS64, not forgetting nop execution.

C Register aliases for the four incoming arguments (STD64 registers).
C The register named in each trailing comment appears to be the DOS64
C counterpart of that argument; DOS64 is not enabled in this file.
define(`rp',      `%rdi')   C rcx
define(`up',      `%rsi')   C rdx
define(`n_param', `%rdx')   C r8
define(`v0_param',`%rcx')   C r9

C Working copy of the limb count.  It shares %rcx with v0_param, so the
C function body must move v0 out of %rcx before writing n; it lives in
C %rcx so that jrcxz can test it without touching the flags.
define(`n',       `%rcx')

C Only the STD64 ABI is supported.  The DOS64 lines below are kept
C commented out as a starting point for the port listed under TODO.
dnl ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

dnl IFDOS(`	define(`up', ``%rsi'')	') dnl
dnl IFDOS(`	define(`rp', ``%rcx'')	') dnl
dnl IFDOS(`	define(`vl', ``%r9'')	') dnl
dnl IFDOS(`	define(`r9', ``rdi'')	') dnl
dnl IFDOS(`	define(`n',  ``%r8'')	') dnl
dnl IFDOS(`	define(`r8', ``r11'')	') dnl
ASM_START()
	TEXT
	ALIGN(32)
PROLOGUE(mpn_addmul_1)

C mpn_addmul_1 (rp, up, n, v0)
C
C Multiply the n limbs at up by the single limb v0, add the products into
C the n limbs at rp, and return the final carry limb in rax.
C
C ABI:      STD64 only; on entry rp = rdi, up = rsi, n = rdx, v0 = rcx.
C Requires: n >= 1.  With n = 0 the dispatch would enter L(f0), which
C           decrements an already zero loop count, so jrcxz would not
C           terminate the loop.
C Clobbers: rcx, rdx, rsi, rdi, r8, r9, r10, flags.
C
C Register roles after the setup code:
C   rdx      v0, the implicit multiplicand of every mulx
C   rcx (n)  n / 8, the loop counter, tested with jrcxz
C   r8d      n mod 8, used once to select the feed-in code
C   r10:r8 and r9:rax
C            alternate as low:high halves of successive products
C   CF       carry chain joining a high half to the next low half (adcx)
C   OF       carry chain for adding the rp limbs (adox)
C
C Both carry chains stay live across the whole loop, so the loop uses only
C instructions that leave the flags alone for bookkeeping: lea rather than
C add/dec for pointer and counter updates, jrcxz rather than a flag test.

	mov	v0_param, %r10		C park v0; rcx is about to become n
	mov	n_param, n
	mov	R32(n_param), R32(%r8)
	shr	$3, n			C n = number of 8-limb groups
	and	$7, R32(%r8)		C clear OF, CF as side-effect
	mov	%r10, %rdx		C v0 into the implicit mulx operand
	lea	L(tab)(%rip), %r10

C Dispatch on n mod 8.  The PIC variant reads 4-byte table entries and adds
C them to the table address; the non-PIC variant jumps through 8-byte
C entries directly.
ifdef(`PIC',
`	movslq	(%r10,%r8,4), %r8
	lea	(%r8, %r10), %r10
	jmp	*%r10
',`
	jmp	*(%r10,%r8,8)
')
	JUMPTABSECT
	ALIGN(8)
L(tab):	JMPENT(	L(f0), L(tab))
	JMPENT(	L(f1), L(tab))
	JMPENT(	L(f2), L(tab))
	JMPENT(	L(f3), L(tab))
	JMPENT(	L(f4), L(tab))
	JMPENT(	L(f5), L(tab))
	JMPENT(	L(f6), L(tab))
	JMPENT(	L(f7), L(tab))
	TEXT

C Feed-in code.  Each L(fK) forms the first product and biases up and rp so
C that the fixed displacements at its loop entry point address limb 0.

C n mod 8 = 0: one of the 8-limb groups is consumed by the partial first
C pass, hence the extra decrement of the counter.
L(f0):	mulx(	(up), %r10, %r8)
	lea	-8(up), up
	lea	-8(rp), rp
	lea	-1(n), n
	jmp	L(b0)

L(f3):	mulx(	(up), %r9, %rax)
	lea	16(up), up
	lea	-48(rp), rp
	jmp	L(b3)

L(f4):	mulx(	(up), %r10, %r8)
	lea	24(up), up
	lea	-40(rp), rp
	jmp	L(b4)

L(f5):	mulx(	(up), %r9, %rax)
	lea	32(up), up
	lea	-32(rp), rp
	jmp	L(b5)

L(f6):	mulx(	(up), %r10, %r8)
	lea	40(up), up
	lea	-24(rp), rp
	jmp	L(b6)

C n mod 8 = 1: a zero counter here means n = 1, handled without the loop.
L(f1):	mulx(	(up), %r9, %rax)
	jrcxz	L(1)
	jmp	L(b1)

C n = 1: single limb, plain add/adc is enough.
L(1):	add	(rp), %r9
	mov	%r9, (rp)
	adc	%rcx, %rax		C relies on rcx = 0
	ret				C carry limb in rax

C Loop exit: store the last limb, then fold both pending carries (OF from
C the adox chain, CF from the adcx chain) into the returned high limb.
L(end):	adox(	(rp), %r9)
	mov	%r9, (rp)
	adox(	%rcx, %rax)		C relies on rcx = 0
	adc	%rcx, %rax		C relies on rcx = 0
	ret				C carry limb in rax

C Padding ahead of L(f2); it follows a ret and is not reached by execution.
C Presumably sized so that L(f2) ends on the 32-byte boundary of L(top),
C which would explain the different counts for the two dispatch variants
C -- TODO confirm against the assembled layout.
ifdef(`PIC',
`	nop;nop;nop;nop',
`	nop;nop;nop;nop;nop;nop;nop;nop;nop;nop;nop')

C n mod 8 = 2: forms the first two products and falls through into the loop.
L(f2):	mulx(	(up), %r10, %r8)
	lea	8(up), up
	lea	8(rp), rp
	mulx(	(up), %r9, %rax)

C Main loop, 8 limbs per pass.  L(b0)..L(b7) are the entry points used by
C the feed-in code above; up advances by 64 at L(b7) and rp by 64 at L(b3),
C which is why the displacements change sign part way through.
	ALIGN(32)
L(top):	adox(	-8,(rp), %r10)
	adcx(	%r8, %r9)
	mov	%r10, -8(rp)
	jrcxz	L(end)
L(b1):	mulx(	8,(up), %r10, %r8)
	adox(	(rp), %r9)
	lea	-1(n), n		C count down without touching CF/OF
	mov	%r9, (rp)
	adcx(	%rax, %r10)
L(b0):	mulx(	16,(up), %r9, %rax)
	adcx(	%r8, %r9)
	adox(	8,(rp), %r10)
	mov	%r10, 8(rp)
L(b7):	mulx(	24,(up), %r10, %r8)
	lea	64(up), up
	adcx(	%rax, %r10)
	adox(	16,(rp), %r9)
	mov	%r9, 16(rp)
L(b6):	mulx(	-32,(up), %r9, %rax)
	adox(	24,(rp), %r10)
	adcx(	%r8, %r9)
	mov	%r10, 24(rp)
L(b5):	mulx(	-24,(up), %r10, %r8)
	adcx(	%rax, %r10)
	adox(	32,(rp), %r9)
	mov	%r9, 32(rp)
L(b4):	mulx(	-16,(up), %r9, %rax)
	adox(	40,(rp), %r10)
	adcx(	%r8, %r9)
	mov	%r10, 40(rp)
L(b3):	adox(	48,(rp), %r9)
	mulx(	-8,(up), %r10, %r8)
	mov	%r9, 48(rp)
	lea	64(rp), rp
	adcx(	%rax, %r10)
	mulx(	(up), %r9, %rax)
	jmp	L(top)

L(f7):	mulx(	(up), %r9, %rax)
	lea	-16(up), up
	lea	-16(rp), rp
	jmp	L(b7)
EPILOGUE()
ASM_END()