beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / x86_64 / coreibwl / mul_1.asm
blob a271e6cc8675c06b3fb3a3efd54dc424ef6aa14c
1 dnl AMD64 mpn_mul_1 optimised for Intel Broadwell.
3 dnl Copyright 2015 Free Software Foundation, Inc.
5 dnl This file is part of the GNU MP Library.
6 dnl
7 dnl The GNU MP Library is free software; you can redistribute it and/or modify
8 dnl it under the terms of either:
9 dnl
10 dnl * the GNU Lesser General Public License as published by the Free
11 dnl Software Foundation; either version 3 of the License, or (at your
12 dnl option) any later version.
13 dnl
14 dnl or
15 dnl
16 dnl * the GNU General Public License as published by the Free Software
17 dnl Foundation; either version 2 of the License, or (at your option) any
18 dnl later version.
19 dnl
20 dnl or both in parallel, as here.
21 dnl
22 dnl The GNU MP Library is distributed in the hope that it will be useful, but
23 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
25 dnl for more details.
26 dnl
27 dnl You should have received copies of the GNU General Public License and the
28 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
29 dnl see https://www.gnu.org/licenses/.
31 include(`../config.m4')
33 C cycles/limb
34 C AMD K8,K9 n/a
35 C AMD K10 n/a
36 C AMD bull n/a
37 C AMD pile n/a
38 C AMD steam n/a
39 C AMD bobcat n/a
40 C AMD jaguar n/a
41 C Intel P4 n/a
42 C Intel core2 n/a
43 C Intel NHM n/a
44 C Intel SBR n/a
45 C Intel IBR n/a
46 C Intel HWL 1.68
47 C Intel BWL 1.69
48 C Intel atom n/a
49 C Intel SLM n/a
50 C VIA nano n/a
52 C The loop of this code is the result of running a code generation and
53 C optimisation tool suite written by David Harvey and Torbjorn Granlund.
55 C TODO
56 C * Put an initial mulx before switching, targeting some free registers.
57 C * Tune feed-in code.
58 C * Trim nop execution after L(f2).
59 C * Port to DOS64, not forgetting nop execution.
C Incoming argument registers for the STD64 ABI.  The register named in the
C trailing comment of each line is presumably the one a DOS64 build would
C receive the same argument in (that port is still disabled below).
define(`rp',       `%rdi')	C rcx
define(`up',       `%rsi')	C rdx
define(`n_param',  `%rdx')	C r8
define(`v0_param', `%rcx')	C r9

C Working limb counter.  It shares %rcx with v0_param, so v0 has to be copied
C out of %rcx before n is written.
define(`n',        `%rcx')

dnl ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

dnl IFDOS(`	define(`up', ``%rsi'')	') dnl
dnl IFDOS(`	define(`rp', ``%rcx'')	') dnl
dnl IFDOS(`	define(`vl', ``%r9'')	') dnl
dnl IFDOS(`	define(`r9', ``rdi'')	') dnl
dnl IFDOS(`	define(`n',  ``%r8'')	') dnl
dnl IFDOS(`	define(`r8', ``r11'')	') dnl
ASM_START()
	TEXT
	ALIGN(32)

C mp_limb_t mpn_mul_1 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_limb_t v0)
C
C Multiply the n-limb operand at up by the single limb v0, store the n low
C limbs of the product at rp, and return the most significant limb in %rax.
C
C ABI:      STD64 only, per the ABI_SUPPORT(STD64) declaration in this file.
C In:       rp = %rdi, up = %rsi, n = %rdx, v0 = %rcx
C Out:      %rax = carry-out limb
C Clobbers: %rcx, %rdx, %rsi, %rdi, %r8, %r9, %r10, flags
C Requires: n >= 1.  With n = 0 the dispatch reaches L(f0) with a zero pass
C           counter, which the dec/jnz at the loop bottom then wraps.
C
C Structure: n mod 8 selects one of eight feed-in sequences through L(tab).
C Each feed-in issues the first mulx, biases up and rp so that the fixed
C displacements in the loop line up, and enters the 8-way unrolled loop at the
C matching L(bK) label.  Feed-ins that join the loop part-way through a pass
C bump the pass counter, because the partial pass still runs into the dec at
C the loop bottom.
C
C Register use in the loop: products alternate between the pairs
C (%r10 low, %r8 high) and (%r9 low, %rax high).  Each adc folds the high limb
C of the previous product, plus the running carry, into the low limb of the
C current one.  CF therefore stays live from the initial and/test right up to
C the final adc; every other instruction in between (mov, lea, mulx, inc, dec)
C leaves CF untouched.

PROLOGUE(mpn_mul_1)

	mov	v0_param, %r10		C save v0, since n reuses the v0_param register
	mov	n_param, n
	mov	R32(n_param), R32(%r8)
	shr	$3, n			C n = count of full 8-limb passes
	and	$7, R32(%r8)		C r8 = n mod 8; clear OF, CF as side-effect
	mov	%r10, %rdx		C rdx = v0, the implicit mulx source operand
	lea	L(tab)(%rip), %r10
C PIC builds read 4-byte entries and add them to the table address; non-PIC
C builds jump through 8-byte entries directly.
ifdef(`PIC',
`	movslq	(%r10,%r8,4), %r8
	lea	(%r8, %r10), %r10
	jmp	*%r10
',`
	jmp	*(%r10,%r8,8)
')
	JUMPTABSECT
	ALIGN(8)
L(tab):	JMPENT(	L(f0), L(tab))
	JMPENT(	L(f1), L(tab))
	JMPENT(	L(f2), L(tab))
	JMPENT(	L(f3), L(tab))
	JMPENT(	L(f4), L(tab))
	JMPENT(	L(f5), L(tab))
	JMPENT(	L(f6), L(tab))
	JMPENT(	L(f7), L(tab))
	TEXT

C n mod 8 = 0: the pass counter is already exact, so no inc here.
L(f0):	mulx(	(up), %r10, %r8)
	lea	56(up), up
	lea	-8(rp), rp
	jmp	L(b0)

L(f3):	mulx(	(up), %r9, %rax)
	lea	16(up), up
	lea	16(rp), rp
	inc	n
	jmp	L(b3)

L(f4):	mulx(	(up), %r10, %r8)
	lea	24(up), up
	lea	24(rp), rp
	inc	n
	jmp	L(b4)

L(f5):	mulx(	(up), %r9, %rax)
	lea	32(up), up
	lea	32(rp), rp
	inc	n
	jmp	L(b5)

L(f6):	mulx(	(up), %r10, %r8)
	lea	40(up), up
	lea	40(rp), rp
	inc	n
	jmp	L(b6)

L(f7):	mulx(	(up), %r9, %rax)
	lea	48(up), up
	lea	48(rp), rp
	inc	n
	jmp	L(b7)

C n mod 8 = 1: with no full pass pending this is the n = 1 case, a single
C product whose high limb is already in the return register.
L(f1):	mulx(	(up), %r9, %rax)
	test	n, n
	jnz	L(b1)
L(1):	mov	%r9, (rp)
	ret				C must not fall through into L(f2)

C n mod 8 = 2: two products are in flight on arrival at L(top) or L(end).
L(f2):	mulx(	(up), %r10, %r8)
	lea	8(up), up
	lea	8(rp), rp
	mulx(	(up), %r9, %rax)
	test	n, n
	jz	L(end)

	ALIGN(32)
L(top):	mov	%r10, -8(rp)
	adc	%r8, %r9
L(b1):	mulx(	8,(up), %r10, %r8)
	adc	%rax, %r10
	lea	64(up), up
	mov	%r9, (rp)
L(b0):	mov	%r10, 8(rp)
	mulx(	-48,(up), %r9, %rax)
	lea	64(rp), rp
	adc	%r8, %r9
L(b7):	mulx(	-40,(up), %r10, %r8)
	mov	%r9, -48(rp)
	adc	%rax, %r10
L(b6):	mov	%r10, -40(rp)
	mulx(	-32,(up), %r9, %rax)
	adc	%r8, %r9
L(b5):	mulx(	-24,(up), %r10, %r8)
	mov	%r9, -32(rp)
	adc	%rax, %r10
L(b4):	mulx(	-16,(up), %r9, %rax)
	mov	%r10, -24(rp)
	adc	%r8, %r9
L(b3):	mulx(	-8,(up), %r10, %r8)
	adc	%rax, %r10
	mov	%r9, -16(rp)
	dec	n			C leaves CF intact for the adc chain
	mulx(	(up), %r9, %rax)
	jnz	L(top)

C Wind-down: store the two products still in flight.  n is zero on both ways
C in here, so the last adc only adds the pending carry to the high limb.
L(end):	mov	%r10, -8(rp)
	adc	%r8, %r9
	mov	%r9, (rp)
	adc	%rcx, %rax
	ret
EPILOGUE()
ASM_END()