beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / x86_64 / bobcat / mul_1.asm
blobab428a88a096ce650c3cb01a97f95d7e6032e4c0
1 dnl AMD64 mpn_mul_1 optimised for AMD bobcat.
3 dnl Copyright 2003-2005, 2007, 2008, 2011, 2012 Free Software Foundation, Inc.
5 dnl This file is part of the GNU MP Library.
6 dnl
7 dnl The GNU MP Library is free software; you can redistribute it and/or modify
8 dnl it under the terms of either:
9 dnl
10 dnl * the GNU Lesser General Public License as published by the Free
11 dnl Software Foundation; either version 3 of the License, or (at your
12 dnl option) any later version.
13 dnl
14 dnl or
15 dnl
16 dnl * the GNU General Public License as published by the Free Software
17 dnl Foundation; either version 2 of the License, or (at your option) any
18 dnl later version.
19 dnl
20 dnl or both in parallel, as here.
21 dnl
22 dnl The GNU MP Library is distributed in the hope that it will be useful, but
23 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
25 dnl for more details.
26 dnl
27 dnl You should have received copies of the GNU General Public License and the
28 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
29 dnl see https://www.gnu.org/licenses/.
31 include(`../config.m4')
33 C cycles/limb
34 C AMD K8,K9 4.5
35 C AMD K10 4.5
36 C AMD bd1 4.62
37 C AMD bobcat 5
38 C Intel P4 14
39 C Intel core2 4.5
40 C Intel NHM 4.23
41 C Intel SBR 3.0
42 C Intel atom 21
43 C VIA nano 4.94
45 C The loop of this code is the result of running a code generation and
46 C optimisation tool suite written by David Harvey and Torbjorn Granlund.
ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

C Standard parameters
C cy is only read by the mpn_mul_1c entry point.  It shares %r8 with w0, so it
C has to be copied out before w0 is first written.
define(`rp',      `%rdi')
define(`up',      `%rsi')
define(`n_param', `%rdx')
define(`v0',      `%rcx')
define(`cy',      `%r8')
C Standard allocations
C n is kept in %rbx, which the code saves and restores with push/pop.
define(`n',       `%rbx')
define(`w0',      `%r8')
define(`w1',      `%r9')
define(`w2',      `%r10')
define(`w3',      `%r11')

C DOS64 parameters
C up is %rsi here because both entry points copy %rdx into %rsi under IFDOS.
C cy is addressed on the stack as 56(%rsp); it is read after the two IFDOS
C pushes (16 bytes).  NOTE(review): the remaining 40 bytes are presumably the
C return address plus the 32-byte shadow area -- confirm against the ABI.
IFDOS(` define(`rp',      `%rcx')    ') dnl
IFDOS(` define(`up',      `%rsi')    ') dnl
IFDOS(` define(`n_param', `%r8')     ') dnl
IFDOS(` define(`v0',      `%r9')     ') dnl
IFDOS(` define(`cy',      `56(%rsp)')') dnl
C DOS64 allocations
C w1 moves to %rdi because %r9 holds v0 in this ABI.
IFDOS(` define(`n',       `%rbx')    ') dnl
IFDOS(` define(`w0',      `%r8')     ') dnl
IFDOS(` define(`w1',      `%rdi')    ') dnl
IFDOS(` define(`w2',      `%r10')    ') dnl
IFDOS(` define(`w3',      `%r11')    ') dnl
ASM_START()
	TEXT
	ALIGN(16)

C mpn_mul_1c -- variant of mpn_mul_1 that starts from a caller-supplied limb.
C
C Loads cy into w2 and jumps to the shared body at the com label inside
C mpn_mul_1, which otherwise starts with w2 cleared.  All other arguments are
C the same as for mpn_mul_1.
C
C Under DOS64 the two pushes and the %rdx to %rsi copy mirror the start of
C mpn_mul_1, so both entry points reach the shared body with the same stack
C layout and with up in %rsi.
PROLOGUE(mpn_mul_1c)
IFDOS(`	push	%rsi		')
IFDOS(`	push	%rdi		')
IFDOS(`	mov	%rdx, %rsi	')
	mov	cy, w2
	jmp	L(com)
EPILOGUE()
C mpn_mul_1 -- multiply the n_param limbs at up by the single limb v0, store
C the low limbs of the result at rp, and return the final high limb in %rax.
C
C In:   rp, up, n_param, v0 as defined by the parameter macros above.
C Out:  %rax = most significant limb of the product (plus propagated carry).
C Uses: %rax, %rdx, w0-w3 as scratch; n (%rbx) is saved and restored here.
C
C The first limb at up is loaded unconditionally, so this assumes
C n_param >= 1 -- TODO confirm against the callers.
C
C Structure: the limb count modulo 4 selects one of four entry stubs
C (b0, b1, b2, b3).  Each stub multiplies the first limb, sets n to a negative
C index, and jumps into the matching stage of the 4-way unrolled loop.  The
C loop alternates between the register pairs (w0,w1) and (w2,w3) for the
C low/high halves of successive products.
PROLOGUE(mpn_mul_1)
IFDOS(`	push	%rsi		')
IFDOS(`	push	%rdi		')
IFDOS(`	mov	%rdx, %rsi	')
	xor	w2, w2			C no incoming carry limb
L(com):	push	%rbx
	mov	(up), %rax		C first source limb

C Point rp and up 16 bytes before the end of their vectors so that the
C loop can index them with the negative counter n.
	lea	-16(rp,n_param,8), rp
	lea	-16(up,n_param,8), up

	mov	n_param, n
	and	$3, R32(n_param)	C dispatch on limb count mod 4
	jz	L(b0)
	cmp	$2, R32(n_param)
	ja	L(b3)
	jz	L(b2)

C Count is 1 mod 4.
L(b1):	mul	v0
	cmp	$1, n
	jz	L(n1)			C exactly one limb, skip the loop
	neg	n
	add	$3, n
	add	%rax, w2
	mov	%rdx, w3
	jmp	L(L1)

C Single-limb case.  The mov instructions between add and adc leave the
C carry flag untouched.
L(n1):	add	%rax, w2
	mov	%rdx, %rax
	mov	w2, 8(rp)
	adc	$0, %rax
	pop	%rbx
IFDOS(`	pop	%rdi		')
IFDOS(`	pop	%rsi		')
	ret

C Count is 3 mod 4.
L(b3):	mul	v0
	neg	n
	inc	n
	add	%rax, w2
	mov	%rdx, w3
	jmp	L(L3)

C Count is 0 mod 4.
L(b0):	mul	v0
	mov	%rax, w0
	mov	%rdx, w1
	neg	n
	add	$2, n
	add	w2, w0
	jmp	L(L0)

C Count is 2 mod 4.
L(b2):	mul	v0
	mov	%rax, w0
	mov	%rdx, w1
	neg	n
	add	w2, w0
	jmp	L(L2)

C Main loop, four limbs per iteration.  Each stage is entered with the
C carry flag holding the carry out of the preceding add, folds it into the
C pending high limb, multiplies the next source limb, and stores the
C previously completed result limb.
	ALIGN(16)
L(top):	mov	w0, -16(rp,n,8)
	add	w1, w2
L(L1):	adc	$0, w3
	mov	0(up,n,8), %rax
	mul	v0
	mov	%rax, w0
	mov	%rdx, w1
	mov	w2, -8(rp,n,8)
	add	w3, w0
L(L0):	adc	$0, w1
	mov	8(up,n,8), %rax
	mul	v0
	mov	%rax, w2
	mov	%rdx, w3
	mov	w0, 0(rp,n,8)
	add	w1, w2
L(L3):	adc	$0, w3
	mov	16(up,n,8), %rax
	mul	v0
	mov	%rax, w0
	mov	%rdx, w1
	mov	w2, 8(rp,n,8)
	add	w3, w0
L(L2):	adc	$0, w1
	mov	24(up,n,8), %rax
	mul	v0
	mov	%rax, w2
	mov	%rdx, w3
	add	$4, n
	js	L(top)			C loop while the index is still negative

C Wind down: store the last two result limbs and return the high limb.
L(end):	mov	w0, (rp)
	add	w1, w2
	adc	$0, w3
	mov	w2, 8(rp)
	mov	w3, %rax

	pop	%rbx
IFDOS(`	pop	%rdi		')
IFDOS(`	pop	%rsi		')
	ret
EPILOGUE()