beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / x86_64 / coreisbr / mul_2.asm
blobffee78a385bc06d62a934d1203e5859d0e05ff3f
1 dnl AMD64 mpn_mul_2 optimised for Intel Sandy Bridge.
3 dnl Contributed to the GNU project by Torbjörn Granlund.
5 dnl Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc.
7 dnl This file is part of the GNU MP Library.
8 dnl
9 dnl The GNU MP Library is free software; you can redistribute it and/or modify
10 dnl it under the terms of either:
11 dnl
12 dnl * the GNU Lesser General Public License as published by the Free
13 dnl Software Foundation; either version 3 of the License, or (at your
14 dnl option) any later version.
15 dnl
16 dnl or
17 dnl
18 dnl * the GNU General Public License as published by the Free Software
19 dnl Foundation; either version 2 of the License, or (at your option) any
20 dnl later version.
21 dnl
22 dnl or both in parallel, as here.
23 dnl
24 dnl The GNU MP Library is distributed in the hope that it will be useful, but
25 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
26 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
27 dnl for more details.
28 dnl
29 dnl You should have received copies of the GNU General Public License and the
30 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
31 dnl see https://www.gnu.org/licenses/.
33 include(`../config.m4')
35 C cycles/limb best
36 C AMD K8,K9
37 C AMD K10
38 C AMD bull
39 C AMD pile
40 C AMD bobcat
41 C AMD jaguar
42 C Intel P4
43 C Intel core
44 C Intel NHM
45 C Intel SBR 2.57 2.52 using 4-way code
46 C Intel IBR 2.35 2.32 using 4-way code
47 C Intel HWL 2.02 1.86
48 C Intel BWL
49 C Intel atom
50 C VIA nano
52 C This code is the result of running a code generation and optimisation tool
53 C suite written by David Harvey and Torbjorn Granlund.
C Operand selector used by the wind-down code.  As defined here it yields its
C first argument, the fixed-offset form.  Change the body to $2 to get the
C conservative indexed form while experimenting with pointer setup.
define(`I', `$1')

C Function arguments.  The register named in each trailing note is the
C matching Microsoft x64 argument register (presumably what a DOS64 build
C receives before FUNC_ENTRY runs -- confirm against the shared x86_64 macro
C definitions).
define(`rp',      `%rdi')	C rcx
define(`up',      `%rsi')	C rdx
define(`n_param', `%rdx')	C r8
define(`vp',      `%rcx')	C r9

C Loop index.  It shares rcx with vp, so vp has to be fully consumed before
C the index is first written.
define(`n',       `%rcx')

C The two multiplier limbs, held in registers that the function pushes on
C entry and pops on exit.
define(`v0',      `%rbx')
define(`v1',      `%rbp')

C Accumulators for the partial products.
define(`w0',      `%r8')
define(`w1',      `%r9')
define(`w2',      `%r10')
define(`w3',      `%r11')
ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

C mpn_mul_2 (rp, up, n, vp)
C
C Multiply the n-limb operand at up by the 2-limb operand at vp, 64 bits per
C limb.  The low n+1 limbs of the product are stored at rp[0..n] and the most
C significant limb is returned in rax.
C
C Operands, as named by the defines earlier in this file:
C   rp = rdi, up = rsi, n_param = rdx, vp = rcx
C Result:            rax = most significant limb of the product
C Saved and restored: rbx (v0), rbp (v1)
C Overwritten:       rax, rcx, rdx, rsi, rdi, r8-r11, flags
C
C The entry code only terminates correctly for n >= 2: with n = 1 the index
C starts at zero, so the first add of 2 produces no carry and the loop branch
C is taken when it should not be.
C NOTE(review): confirm that every caller guarantees n >= 2.

ASM_START()
	TEXT
	ALIGN(32)
PROLOGUE(mpn_mul_2)
	FUNC_ENTRY(4)
	push	%rbx			C rbx will hold v0
	push	%rbp			C rbp will hold v1

	mov	(vp), v0		C v0 = vp[0]
	mov	8(vp), v1		C v1 = vp[1], vp is dead from here so rcx can become n

	mov	(up), %rax		C rax = up[0], fetched before up is advanced
	lea	(up,n_param,8), up	C up and rp now point just past their n-th limb,
	lea	(rp,n_param,8), rp	C so a negative index n can count up towards zero

	test	$1, R8(n_param)
	jnz	L(b1)			C odd n enters the loop at lo1, even n at lo0

C Even n: index starts at -n_param.
L(b0):	mov	$0, R32(n)
	sub	n_param, n		C n = -n_param
	xor	w0, w0
	mul	v0			C rdx:rax = up[0] * v0
	mov	%rax, w2
	mov	%rdx, w1
	mov	(up,n,8), %rax		C reload up[0], mul overwrote rax
	jmp	L(lo0)

C Odd n: index starts at 1 - n_param.
L(b1):	mov	$1, R32(n)
	sub	n_param, n		C n = 1 - n_param
	xor	w2, w2
	mul	v0			C rdx:rax = up[0] * v0
	mov	%rax, w0
	mov	%rdx, w3
	mov	-8(up,n,8), %rax	C reload up[0]
	mul	v1			C rdx:rax = up[0] * v1
	jmp	L(lo1)

C Main loop.  Each full pass multiplies two limbs of up by both v0 and v1 and
C stores two limbs of rp.  n is a negative limb index stepped by 2; the add
C sets carry exactly when n reaches zero, which is what ends the loop.
C NOTE(review): the trailing numbers on the instructions appear to tag which
C result column each instruction contributes to -- confirm with the authors.
	ALIGN(32)
L(top):	mul	v0
	add	%rax, w0		C 1
	mov	%rdx, w3		C 2
	adc	$0, w3			C 2
	mov	-8(up,n,8), %rax
	mul	v1
	add	w1, w0			C 1
	adc	$0, w3			C 2
L(lo1):	add	%rax, w2		C 2
	mov	w0, -8(rp,n,8)		C 1
	mov	%rdx, w0		C 3
	adc	$0, w0			C 3
	mov	(up,n,8), %rax
	mul	v0
	add	%rax, w2		C 2
	mov	%rdx, w1		C 3
	adc	$0, w1			C 3
	add	w3, w2			C 2
	mov	(up,n,8), %rax		C same limb again, this time for the v1 product
	adc	$0, w1			C 1
L(lo0):	mul	v1
	mov	w2, (rp,n,8)		C 2
	add	%rax, w0		C 3
	mov	%rdx, w2		C 4
	mov	8(up,n,8), %rax		C prefetch the limb the next mul v0 will use
	adc	$0, w2			C 4
	add	$2, n
	jnc	L(top)

C Wind-down: n is zero here and rax already holds the last limb of up.  The
C I macro picks the fixed-offset operand, which addresses the same limbs as
C the indexed form while n is zero.
L(end):	mul	v0
	add	%rax, w0
	mov	%rdx, w3
	adc	$0, w3
	mov	I(-8(up),-8(up,n,8)), %rax
	mul	v1
	add	w1, w0
	adc	$0, w3
	add	%rax, w2
	mov	w0, I(-8(rp),-8(rp,n,8))	C rp[n_param-1]
	adc	$0, %rdx
	add	w3, w2
	mov	w2, I((rp),(rp,n,8))		C rp[n_param]
	adc	$0, %rdx
	mov	%rdx, %rax			C return the most significant limb

	pop	%rbp
	pop	%rbx
	FUNC_EXIT()
	ret				C explicit return, nothing else in this function emits one (FUNC_EXIT is not expected to -- see x86_64-defs.m4)
EPILOGUE()