beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / x86_64 / mul_2.asm
blobf408c52250e6612c97b1127410c710cf5b0bc7c4
1 dnl AMD64 mpn_mul_2 -- Multiply an n-limb vector with a 2-limb vector and
2 dnl store the result in a third limb vector.
4 dnl Copyright 2008, 2011, 2012 Free Software Foundation, Inc.
6 dnl This file is part of the GNU MP Library.
7 dnl
8 dnl The GNU MP Library is free software; you can redistribute it and/or modify
9 dnl it under the terms of either:
10 dnl
11 dnl * the GNU Lesser General Public License as published by the Free
12 dnl Software Foundation; either version 3 of the License, or (at your
13 dnl option) any later version.
14 dnl
15 dnl or
16 dnl
17 dnl * the GNU General Public License as published by the Free Software
18 dnl Foundation; either version 2 of the License, or (at your option) any
19 dnl later version.
20 dnl
21 dnl or both in parallel, as here.
22 dnl
23 dnl The GNU MP Library is distributed in the hope that it will be useful, but
24 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
25 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
26 dnl for more details.
27 dnl
28 dnl You should have received copies of the GNU General Public License and the
29 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
30 dnl see https://www.gnu.org/licenses/.
32 include(`../config.m4')
34 C cycles/limb
35 C AMD K8,K9 2.275
36 C AMD K10 2.275
37 C Intel P4 13.5
38 C Intel core2 4.0
39 C Intel corei 3.8
40 C Intel atom ?
41 C VIA nano ?
43 C This code is the result of running a code generation and optimization tool
44 C suite written by David Harvey and Torbjorn Granlund.
46 C TODO
47 C * Work on feed-in and wind-down code.
48 C * Convert "mov $0" to "xor".
49 C * Adjust initial lea to save some bytes.
50 C * Perhaps adjust n from n_param&3 value?
51 C * Replace with 2.25 c/l sequence.
53 C INPUT PARAMETERS
C Register roles for mpn_mul_2.  The first four names are the incoming
C arguments; the remaining names are working registers.
define(`rp',      `%rdi')	C result vector pointer
define(`up',      `%rsi')	C n-limb source vector pointer
define(`n_param', `%rdx')	C source limb count; %rdx is overwritten by mul
define(`vp',      `%rcx')	C pointer to the two multiplier limbs

define(`v0', `%r8')		C multiplier limb read from 0(vp)
define(`v1', `%r9')		C multiplier limb read from 8(vp)
define(`w0', `%rbx')		C accumulator; pushed and popped by mpn_mul_2
define(`w1', `%rcx')		C accumulator; same register as vp, so vp
				C must be fully read before w1 is written
define(`w2', `%rbp')		C accumulator; pushed and popped by mpn_mul_2
define(`w3', `%r10')		C accumulator
define(`n',  `%r11')		C negated limb index, stepped up towards zero
ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

C mpn_mul_2 -- multiply the n-limb vector at up by the 2-limb vector at vp.
C
C In:   rp       result pointer
C       up       source pointer
C       n_param  number of source limbs
C       vp       pointer to the two multiplier limbs
C Out:  result limbs 0 .. n_param are stored through rp and the limb above
C       them is returned in %rax.
C Regs: %rbx and %rbp are saved and restored here.  %rax, %rcx, %rdx, %rsi,
C       %rdi and %r8-%r11 are overwritten.
C
C After setup, up points at the last source limb, rp at result limb
C n_param-1, and n is a negative index that rises by 4 per loop pass and is
C zero when the loop exits.  The loop handles four source limbs per pass, so
C the feed-in code selects one of four entry points from n_param mod 4.
C w0-w3 form a rotating window of partial sums: each stage adds a v0 and a
C v1 product into the window, stores the finished low limb and clears the
C register that becomes the new top.
C
C FUNC_ENTRY and FUNC_EXIT are defined outside this file; presumably they
C hold ABI-specific entry and exit glue -- confirm in the included m4 files.
C
C NOTE(review): with n_param = 1 the m2p1 path enters the loop with n = 0
C and still runs a complete pass, touching limbs beyond both operands.  The
C routine therefore seems to assume callers never pass n_param = 1 --
C confirm against the callers.

ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(mpn_mul_2)
	FUNC_ENTRY(4)
	push	%rbx			C w0 is kept in %rbx
	push	%rbp			C w2 is kept in %rbp

	mov	(vp), v0		C read both multiplier limbs now, since
	mov	8(vp), v1		C w1 reuses the register holding vp

	mov	(up), %rax		C lowest source limb, ready for the first mul

	mov	n_param, n
	neg	n			C n = -n_param
	lea	-8(up,n_param,8), up	C up -> last source limb
	lea	-8(rp,n_param,8), rp	C rp -> result limb n_param-1

	and	$3, R32(n_param)	C dispatch on n_param mod 4
	jz	L(m2p0)			C remainder 0
	cmp	$2, R32(n_param)
	jc	L(m2p1)			C remainder 1
	jz	L(m2p2)			C remainder 2; remainder 3 falls through

L(m2p3):
	mul	v0			C lowest limb * v0
	xor	R32(w3), R32(w3)
	mov	%rax, w1
	mov	%rdx, w2
	mov	8(up,n,8), %rax		C lowest limb again (n is still -n_param)
	add	$-1, n
	mul	v1			C lowest limb * v1
	add	%rax, w2		C carry is consumed by the adc at m23;
	jmp	L(m23)			C jmp leaves the flags alone

L(m2p0):
	mul	v0			C lowest limb * v0
	xor	R32(w2), R32(w2)
	mov	%rax, w0
	mov	%rdx, w1
	jmp	L(m20)

L(m2p1):
	mul	v0			C lowest limb * v0, left in %rdx:%rax
	xor	R32(w3), R32(w3)	C for the adds at the loop top
	xor	R32(w0), R32(w0)
	xor	R32(w1), R32(w1)
	add	$1, n
	jmp	L(m2top)

L(m2p2):
	mul	v0			C lowest limb * v0
	xor	R32(w0), R32(w0)
	xor	R32(w1), R32(w1)
	mov	%rax, w2
	mov	%rdx, w3
	mov	8(up,n,8), %rax		C lowest limb again (n is still -n_param)
	add	$-2, n
	jmp	L(m22)


	ALIGN(32)
L(m2top):
	add	%rax, w3		C add the v0 product issued at the end of
	adc	%rdx, w0		C the previous pass (or by m2p1)
	mov	0(up,n,8), %rax
	adc	$0, R32(w1)
	mov	$0, R32(w2)		C w2 becomes the new top of the window
	mul	v1
	add	%rax, w0
	mov	w3, 0(rp,n,8)		C w3 is final: store it
	adc	%rdx, w1
	mov	8(up,n,8), %rax
	mul	v0
	add	%rax, w0
	adc	%rdx, w1
	adc	$0, R32(w2)
L(m20):	mov	8(up,n,8), %rax		C entry point for remainder 0
	mul	v1
	add	%rax, w1
	adc	%rdx, w2
	mov	16(up,n,8), %rax
	mov	$0, R32(w3)		C w3 becomes the new top of the window
	mul	v0
	add	%rax, w1
	mov	16(up,n,8), %rax
	adc	%rdx, w2
	adc	$0, R32(w3)
	mul	v1
	add	%rax, w2		C carry is consumed by the adc at m23
	mov	w0, 8(rp,n,8)		C w0 is final: store it
L(m23):	adc	%rdx, w3		C entry point for remainder 3
	mov	24(up,n,8), %rax
	mul	v0
	mov	$0, R32(w0)		C w0 becomes the new top of the window
	add	%rax, w2
	adc	%rdx, w3
	mov	w1, 16(rp,n,8)		C w1 is final: store it
	mov	24(up,n,8), %rax
	mov	$0, R32(w1)		C must stay a mov: the carry from the adc
	adc	$0, R32(w0)		C above is still live and xor would clear it
L(m22):	mul	v1			C entry point for remainder 2
	add	%rax, w3
	mov	w2, 24(rp,n,8)		C w2 is final: store it
	adc	%rdx, w0
	mov	32(up,n,8), %rax
	mul	v0			C product is added at m2top or in wind-down
	add	$4, n
	js	L(m2top)		C loop while n is still negative


C Wind-down: add the pending v0 product, multiply the last source limb by
C v1, store the two remaining result limbs and return the top limb.
	add	%rax, w3
	adc	%rdx, w0
	adc	$0, R32(w1)
	mov	(up), %rax		C last source limb
	mul	v1
	mov	w3, (rp)
	add	%rax, w0
	adc	%rdx, w1
	mov	w0, 8(rp)
	mov	w1, %rax		C return value: most significant limb

	pop	%rbp
	pop	%rbx
	FUNC_EXIT()
	ret				C return to the caller; without this
					C instruction execution would run on
					C past the end of the function
EPILOGUE()