dnl  source/libs/gmp/gmp-src/mpn/x86_64/coreisbr/mul_basecase.asm
dnl  AMD64 mpn_mul_basecase optimised for Intel Sandy Bridge and Ivy Bridge.

dnl  Contributed to the GNU project by Torbjörn Granlund.

dnl  Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc.
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C cycles/limb     mul_1    mul_2    mul_3    addmul_2
C AMD K8,K9
C AMD K10
C AMD bull
C AMD pile
C AMD steam
C AMD bobcat
C AMD jaguar
C Intel P4
C Intel core
C Intel NHM
C Intel SBR        2.5      2.5      -        2.95
C Intel IBR        2.4      2.3      -        2.68
C Intel HWL        2.35     2.0      -        2.5
C Intel BWL
C Intel atom
C VIA nano

C The inner loops of this code are the result of running a code generation and
C optimisation tool suite written by David Harvey and Torbjörn Granlund.

C TODO
C  * Fix the addmul_2 fluctuation affecting SBR.
C  * Improve feed-in code, avoiding zeroing of many registers and dummy adds in
C    the loops at the expense of code size.
C  * Adjoin a mul_3, avoiding slow mul_1 for odd vn.
C  * Consider replacing the 2-way mul_2 code with 4-way code, for a very slight
C    speedup.
C  * Further micro-optimise.

C When playing with pointers, set this to $2 to fall back to conservative
C indexing in wind-down code.
define(`I',`$1')
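
C With I at its default $1, a use such as `mov w1, I(-8(rp),-24(rp,n,8))'
C below expands to the direct form `mov w1, -8(rp)'; redefining I as $2
C selects the second, conservatively indexed operand instead.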

define(`rp',       `%rdi')
define(`up',       `%rsi')
define(`un_param', `%rdx')
define(`vp',       `%rcx')
define(`vn',       `%r8')

define(`un', `%rbx')

define(`w0', `%r10')
define(`w1', `%r11')
define(`w2', `%r12')
define(`w3', `%r13')
define(`n',  `%rbp')
define(`v0', `%r9')
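
C Reference semantics, as an illustrative C sketch (not part of the build):
C mpn_mul_basecase forms the full (un+vn)-limb product rp[] = up[] * vp[]
C for un >= vn >= 1, equivalent to
C
C	rp[un] = mpn_mul_1 (rp, up, un, vp[0]);
C	for (i = 1; i < vn; i++)
C	  rp[un + i] = mpn_addmul_1 (rp + i, up, un, vp[i]);
C
C This file instead feeds in with mul_1 (vn odd) or mul_2 (vn even), then
C consumes the remaining v limbs two at a time with addmul_2.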

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(mpn_mul_basecase)
	FUNC_ENTRY(4)
IFDOS(`	mov	56(%rsp), %r8d	')
	push	%rbx
	push	%rbp
	mov	un_param, un		C free up rdx
	neg	un

	mov	(up), %rax		C shared for mul_1 and mul_2
	lea	(up,un_param,8), up	C point at operand end
	lea	(rp,un_param,8), rp	C point at rp[un-1]

	mov	(vp), v0		C shared for mul_1 and mul_2
	mul	v0			C shared for mul_1 and mul_2

	test	$1, R8(vn)
	jz	L(do_mul_2)
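
C Dispatch on vn: an odd vn first runs one mul_1 pass over up[] with vp[0],
C leaving an even number of v limbs for the addmul_2 outer loop; an even vn
C starts directly with a mul_2 pass using vp[0] and vp[1].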

L(do_mul_1):
	test	$1, R8(un)
	jnz	L(m1x1)

L(m1x0):mov	%rax, w0		C un = 2, 4, 6, 8, ...
	mov	%rdx, w1
	mov	8(up,un,8), %rax
	test	$2, R8(un)
	jnz	L(m110)

L(m100):lea	2(un), n		C un = 4, 8, 12, ...
	jmp	L(m1l0)

L(m110):lea	(un), n			C un = 2, 6, 10, ...
	jmp	L(m1l2)

L(m1x1):mov	%rax, w1		C un = 1, 3, 5, 7, ...
	mov	%rdx, w0
	test	$2, R8(un)
	jz	L(m111)

L(m101):lea	3(un), n		C un = 1, 5, 9, ...
	test	n, n
	js	L(m1l1)
	mov	%rax, -8(rp)
	mov	%rdx, (rp)
	pop	%rbp
	pop	%rbx
	FUNC_EXIT()
	ret

L(m111):lea	1(un), n		C un = 3, 7, 11, ...
	mov	8(up,un,8), %rax
	jmp	L(m1l3)
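
C The mul_1 loop below is 4-way unrolled; the feed-in code above enters it
C at L(m1l0)..L(m1l3) according to un mod 4, and n advances by 4 limbs per
C iteration until the `add $4, n' carries.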

	ALIGN(16)			C FIXME
L(m1tp):mov	%rdx, w0
	add	%rax, w1
L(m1l1):mov	-16(up,n,8), %rax
	adc	$0, w0
	mul	v0
	add	%rax, w0
	mov	w1, -24(rp,n,8)
	mov	-8(up,n,8), %rax
	mov	%rdx, w1
	adc	$0, w1
L(m1l0):mul	v0
	mov	w0, -16(rp,n,8)
	add	%rax, w1
	mov	%rdx, w0
	mov	(up,n,8), %rax
	adc	$0, w0
L(m1l3):mul	v0
	mov	w1, -8(rp,n,8)
	mov	%rdx, w1
	add	%rax, w0
	mov	8(up,n,8), %rax
	adc	$0, w1
L(m1l2):mul	v0
	mov	w0, (rp,n,8)
	add	$4, n
	jnc	L(m1tp)

L(m1ed):add	%rax, w1
	adc	$0, %rdx
	mov	w1, I(-8(rp),-24(rp,n,8))
	mov	%rdx, I((rp),-16(rp,n,8))

	dec	R32(vn)
	jz	L(ret2)

	lea	8(vp), vp
	lea	8(rp), rp
	push	%r12
	push	%r13
	push	%r14
	jmp	L(do_addmul)

L(do_mul_2):
define(`v1', `%r14')
	push	%r12
	push	%r13
	push	%r14

	mov	8(vp), v1

	test	$1, R8(un)
	jnz	L(m2b1)

L(m2b0):lea	(un), n
	xor	w0, w0
	mov	%rax, w2
	mov	%rdx, w1
	jmp	L(m2l0)

L(m2b1):lea	1(un), n
	xor	w1, w1
	xor	w2, w2
	mov	%rax, w0
	mov	%rdx, w3
	jmp	L(m2l1)
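
C The mul_2 loop below multiplies up[] by the two-limb value (v1:v0) in a
C single pass, 2-way unrolled; the feed-in above enters at L(m2l0) or
C L(m2l1) according to un mod 2.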

	ALIGN(32)
L(m2tp):mul	v0
	add	%rax, w0
	mov	%rdx, w3
	adc	$0, w3
L(m2l1):mov	-8(up,n,8), %rax
	mul	v1
	add	w1, w0
	adc	$0, w3
	add	%rax, w2
	mov	w0, -8(rp,n,8)
	mov	%rdx, w0
	adc	$0, w0
	mov	(up,n,8), %rax
	mul	v0
	add	%rax, w2
	mov	%rdx, w1
	adc	$0, w1
	add	w3, w2
L(m2l0):mov	(up,n,8), %rax
	adc	$0, w1
	mul	v1
	mov	w2, (rp,n,8)
	add	%rax, w0
	mov	%rdx, w2
	mov	8(up,n,8), %rax
	adc	$0, w2
	add	$2, n
	jnc	L(m2tp)

L(m2ed):mul	v0
	add	%rax, w0
	mov	%rdx, w3
	adc	$0, w3
	mov	I(-8(up),-8(up,n,8)), %rax
	mul	v1
	add	w1, w0
	adc	$0, w3
	add	%rax, w2
	mov	w0, I(-8(rp),-8(rp,n,8))
	adc	$0, %rdx
	add	w3, w2
	mov	w2, I((rp),(rp,n,8))
	adc	$0, %rdx
	mov	%rdx, I(8(rp),8(rp,n,8))

	add	$-2, R32(vn)
	jz	L(ret5)
	lea	16(vp), vp
	lea	16(rp), rp

L(do_addmul):
	push	%r15
	push	vn			C save vn in new stack slot
define(`vn', `(%rsp)')
define(`X0', `%r14')
define(`X1', `%r15')
define(`v1', `%r8')
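
C From here on %r8 is reused as v1, so the remaining count vn lives in the
C stack slot pushed above.  Each L(outer) pass performs an addmul_2, adding
C up[] times the next two v limbs (v1:v0) into rp[], then stepping vp and
C rp by 16 bytes.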

L(outer):
	mov	(vp), v0
	mov	8(vp), v1
	mov	(up,un,8), %rax
	mul	v0
	test	$1, R8(un)
	jnz	L(a1x1)

L(a1x0):mov	(rp,un,8), X0
	xor	w0, w0
	mov	%rdx, w1
	test	$2, R8(un)
	jnz	L(a110)

L(a100):lea	2(un), n		C un = 4, 8, 12, ...
	add	%rax, X0
	adc	$0, w1
	mov	(up,un,8), %rax
	mul	v1
	mov	8(rp,un,8), X1
	jmp	L(lo0)

L(a110):lea	(un), n			C un = 2, 6, 10, ...
	xor	w3, w3
	jmp	L(lo2)

L(a1x1):mov	(rp,un,8), X1
	xor	w2, w2
	xor	w1, w1
	test	$2, R8(un)
	jz	L(a111)

L(a101):lea	3(un), n		C un = 1, 5, 9, ...
	mov	%rdx, w3
	add	%rax, X1
	mov	(up,un,8), %rax
	mov	8(rp,un,8), X0
	adc	$0, w3
	jmp	L(top)

L(a111):lea	1(un), n		C un = 3, 7, 11, ...
	jmp	L(lo3)
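
C The addmul_2 inner loop below is 4-way unrolled, entered at L(top),
C L(lo0), L(lo3) or L(lo2) according to un mod 4; X0 and X1 hold the rp
C limbs being updated, w0..w3 the partial-product carries.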

	ALIGN(32)
L(top):	mul	v1
	mov	%rdx, w0
	add	%rax, X0
	adc	$0, w0
	add	w1, X1
	adc	$0, w3
	add	w2, X0
	adc	$0, w0
	mov	-16(up,n,8), %rax
	mul	v0
	add	%rax, X0
	mov	%rdx, w1
	adc	$0, w1
	mov	-16(up,n,8), %rax
	mul	v1
	mov	X1, -24(rp,n,8)
	mov	-8(rp,n,8), X1
	add	w3, X0
	adc	$0, w1
L(lo0):	mov	%rdx, w2
	mov	X0, -16(rp,n,8)
	add	%rax, X1
	adc	$0, w2
	mov	-8(up,n,8), %rax
	add	w0, X1
	adc	$0, w2
	mul	v0
L(lo3):	add	%rax, X1
	mov	%rdx, w3
	adc	$0, w3
	mov	-8(up,n,8), %rax
	mul	v1
	add	w1, X1
	mov	(rp,n,8), X0
	adc	$0, w3
	mov	%rdx, w0
	add	%rax, X0
	adc	$0, w0
	mov	(up,n,8), %rax
	mul	v0
	add	w2, X0
	mov	X1, -8(rp,n,8)
	mov	%rdx, w1
	adc	$0, w0
L(lo2):	add	%rax, X0
	adc	$0, w1
	mov	(up,n,8), %rax
	add	w3, X0
	adc	$0, w1
	mul	v1
	mov	8(rp,n,8), X1
	add	%rax, X1
	mov	%rdx, w2
	adc	$0, w2
	mov	8(up,n,8), %rax
	mov	X0, (rp,n,8)
	mul	v0
	add	w0, X1
	mov	%rdx, w3
	adc	$0, w2
	add	%rax, X1
	mov	8(up,n,8), %rax
	mov	16(rp,n,8), X0		C useless but harmless in final iter
	adc	$0, w3
	add	$4, n
	jnc	L(top)

L(end):	mul	v1
	add	w1, X1
	adc	$0, w3
	add	w2, %rax
	adc	$0, %rdx
	mov	X1, I(-8(rp),-24(rp,n,8))
	add	w3, %rax
	adc	$0, %rdx
	mov	%rax, I((rp),-16(rp,n,8))
	mov	%rdx, I(8(rp),-8(rp,n,8))

	addl	$-2, vn
	lea	16(vp), vp
	lea	16(rp), rp
	jnz	L(outer)

	pop	%rax			C deallocate vn slot
	pop	%r15
L(ret5):pop	%r14
	pop	%r13
	pop	%r12
L(ret2):pop	%rbp
	pop	%rbx
	FUNC_EXIT()
	ret
EPILOGUE()
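
C Illustrative caller-side sketch (mpn_mul_basecase is a GMP-internal
C function, declared in gmp-impl.h; sizes are in limbs, un >= vn >= 1,
C and rp must not overlap the inputs):
C
C	mp_limb_t up[5], vp[3], rp[5 + 3];
C	/* ... fill up[] and vp[] ... */
C	mpn_mul_basecase (rp, up, 5, vp, 3);	/* 8-limb product in rp */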