beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / x86_64 / mul_1.asm
blob b032afc9ddb7a5931436bbb6166d3dab0e6d99a6
1 dnl AMD64 mpn_mul_1.
3 dnl Copyright 2003-2005, 2007, 2008, 2012 Free Software Foundation, Inc.
5 dnl This file is part of the GNU MP Library.
6 dnl
7 dnl The GNU MP Library is free software; you can redistribute it and/or modify
8 dnl it under the terms of either:
9 dnl
10 dnl * the GNU Lesser General Public License as published by the Free
11 dnl Software Foundation; either version 3 of the License, or (at your
12 dnl option) any later version.
13 dnl
14 dnl or
15 dnl
16 dnl * the GNU General Public License as published by the Free Software
17 dnl Foundation; either version 2 of the License, or (at your option) any
18 dnl later version.
19 dnl
20 dnl or both in parallel, as here.
21 dnl
22 dnl The GNU MP Library is distributed in the hope that it will be useful, but
23 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
25 dnl for more details.
26 dnl
27 dnl You should have received copies of the GNU General Public License and the
28 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
29 dnl see https://www.gnu.org/licenses/.
31 include(`../config.m4')
33 C cycles/limb
34 C AMD K8,K9 2.5
35 C AMD K10 2.5
36 C AMD bd1 5.0
37 C AMD bobcat 5.5
38 C Intel P4 12.3
39 C Intel core2 4.0
40 C Intel NHM 3.75
41 C Intel SBR 2.95
42 C Intel atom 19.8
43 C VIA nano 4.25
45 C The loop of this code is the result of running a code generation and
46 C optimization tool suite written by David Harvey and Torbjorn Granlund.
48 C TODO
49 C * The loop is great, but the prologue and epilogue code was quickly written.
50 C Tune it!
C Symbolic names for the four arguments, bound to the registers they arrive
C in under STD64. The trailing comment on each line names the register that
C carries the same argument on entry under DOS64.
52 define(`rp', `%rdi') C rcx
53 define(`up', `%rsi') C rdx
54 define(`n_param', `%rdx') C r8
55 define(`vl', `%rcx') C r9
C Working copy of the limb count. Under STD64 the count is moved out of
C %rdx into this register because mul overwrites %rdx.
57 define(`n', `%r11')
C Both calling conventions are declared as supported by this file.
59 ABI_SUPPORT(DOS64)
60 ABI_SUPPORT(STD64)
C DOS64 overrides of the names above:
C   up  is %rsi, the entry code copies the incoming %rdx there
C   rp  is %rcx and vl is %r9, their incoming registers
C   n   is %r8, its incoming register, so no separate n_param copy is needed
C The bare words r9 and r8 are also redefined, so the scratch registers that
C the code below spells %r9 and %r8 come out as %rdi and %r11 and do not
C collide with vl and n. The doubled quotes around the replacement texts are
C presumably what stops %r9 and %r8 in vl and n from being remapped again;
C this relies on standard m4 quoting, confirm against config.m4.
62 IFDOS(` define(`up', ``%rsi'') ') dnl
63 IFDOS(` define(`rp', ``%rcx'') ') dnl
64 IFDOS(` define(`vl', ``%r9'') ') dnl
65 IFDOS(` define(`r9', ``rdi'') ') dnl
66 IFDOS(` define(`n', ``%r8'') ') dnl
67 IFDOS(` define(`r8', ``r11'') ') dnl
C Start of the emitted code. ASM_START and TEXT come from config.m4,
C presumably the file preamble and the text section selector. ALIGN(16)
C applies to the first entry point that follows.
69 ASM_START()
70 TEXT
71 ALIGN(16)
C mpn_mul_1c -- entry point that takes an extra carry-in limb.
C
C The first four arguments use the rp, up, n_param/n and vl registers defined
C above. The extra fifth limb is loaded into %r10:
C   STD64: it arrives in %r8
C   DOS64: it is read from the stack at 64(%rsp)
C This entry contains no multiply code of its own. It performs the same
C pushes as the mpn_mul_1 entry and then jumps to L(common), a label that is
C defined inside mpn_mul_1, where %r10 is added to the first product.
72 PROLOGUE(mpn_mul_1c)
C DOS64 only: save %rsi and %rdi, then copy the incoming %rdx into %rsi,
C which is the register the DOS64 definition of up names.
73 IFDOS(``push %rsi '')
74 IFDOS(``push %rdi '')
75 IFDOS(``mov %rdx, %rsi '')
C %rbx is saved on both ABIs; the shared body pops it before returning.
76 push %rbx
C Fetch the carry-in limb into %r10. The DOS64 offset depends on the three
C pushes above having been executed first.
77 IFSTD(` mov %r8, %r10')
78 IFDOS(` mov 64(%rsp), %r10') C 40 + 3*8 (3 push insns)
79 jmp L(common)
80 EPILOGUE()
C mpn_mul_1 -- multiply the n-limb vector at up by the single limb vl.
C
C C-level view, presumably the usual GMP prototype (confirm in gmp.h):
C   mp_limb_t mpn_mul_1 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_limb_t vl)
C
C In:    rp  destination, n limbs are written
C        up  source, n limbs are read
C        n   limb count, assumed to be at least 1 since (up) is read
C            unconditionally
C        vl  multiplier limb
C Out:   %rax = most significant limb of the product (the carry-out limb)
C Saves: %rbx on both ABIs, plus %rsi and %rdi under DOS64
C
C L(common) is also the landing point for mpn_mul_1c, which arrives with the
C same stack layout and with its carry-in limb already in %r10.
82 PROLOGUE(mpn_mul_1)
C DOS64 only: save %rsi and %rdi, then copy the incoming %rdx into %rsi,
C which is the register the DOS64 definition of up names.
83 IFDOS(``push %rsi '')
84 IFDOS(``push %rdi '')
85 IFDOS(``mov %rdx, %rsi '')
87 push %rbx
88 xor %r10, %r10 C no carry-in on this entry
89 L(common):
90 mov (up), %rax C read first u limb early
91 IFSTD(` mov n_param, %rbx ') C move away n from rdx, mul uses it
92 IFDOS(` mov n, %rbx ')
93 mul vl
94 IFSTD(` mov %rbx, n ')
C Fold the carry-in limb into the first product, %rdx:%rax.
96 add %r10, %rax
97 adc $0, %rdx
C Dispatch on n mod 4 to choose the entry point into the 4-way unrolled
C loop: 0 goes to L(b0), 2 to L(b2), 3 to L(b3), 1 falls through to L(b1).
99 and $3, R32(%rbx)
100 jz L(b0)
101 cmp $2, R32(%rbx)
102 jz L(b2)
103 jg L(b3)
C Each bN block below biases up and rp and negates n, so that the loop can
C index with a negative n that counts up towards zero, and it places the
C first product in the accumulator registers expected at its loop entry.
C
C n mod 4 = 1. With n = 1 the low limb is stored and the high limb returned.
105 L(b1): dec n
106 jne L(gt1)
107 mov %rax, (rp)
108 jmp L(ret)
109 L(gt1): lea 8(up,n,8), up
110 lea -8(rp,n,8), rp
111 neg n
112 xor %r10, %r10
113 xor R32(%rbx), R32(%rbx)
114 mov %rax, %r9
115 mov (up,n,8), %rax C next u limb
116 mov %rdx, %r8
117 jmp L(L1)
C n mod 4 = 0.
119 L(b0): lea (up,n,8), up
120 lea -16(rp,n,8), rp
121 neg n
122 xor %r10, %r10
123 mov %rax, %r8
124 mov %rdx, %rbx
125 jmp L(L0)
C n mod 4 = 3.
127 L(b3): lea -8(up,n,8), up
128 lea -24(rp,n,8), rp
129 neg n
130 mov %rax, %rbx
131 mov %rdx, %r10
132 jmp L(L3)
C n mod 4 = 2.
134 L(b2): lea -16(up,n,8), up
135 lea -32(rp,n,8), rp
136 neg n
137 xor %r8, %r8
138 xor R32(%rbx), R32(%rbx)
139 mov %rax, %r10
140 mov 24(up,n,8), %rax C next u limb
141 mov %rdx, %r9
142 jmp L(L2)
C Main loop, four limbs per iteration. %r10, %r9, %r8 and %rbx take turns
C holding result limbs that are complete or still waiting for the low half
C of the next product. The instruction order is machine tuned, see the note
C at the top of the file, so it is left exactly as it was.
144 ALIGN(16)
145 L(top): mov %r10, (rp,n,8)
146 add %rax, %r9
147 mov (up,n,8), %rax
148 adc %rdx, %r8
149 mov $0, R32(%r10)
150 L(L1): mul vl
151 mov %r9, 8(rp,n,8)
152 add %rax, %r8
153 adc %rdx, %rbx
154 L(L0): mov 8(up,n,8), %rax
155 mul vl
156 mov %r8, 16(rp,n,8)
157 add %rax, %rbx
158 adc %rdx, %r10
159 L(L3): mov 16(up,n,8), %rax
160 mul vl
161 mov %rbx, 24(rp,n,8)
162 mov $0, R32(%r8) C zero
163 mov %r8, %rbx C zero
164 add %rax, %r10
165 mov 24(up,n,8), %rax
166 mov %r8, %r9 C zero
167 adc %rdx, %r9
168 L(L2): mul vl
169 add $4, n
170 js L(top) C loop while n is still negative
C Wind down: store the last two result limbs. %r8 is zero on every path that
C reaches this point, so the adc adds only the carry flag to the high limb
C and the final add leaves %rdx unchanged.
172 mov %r10, (rp,n,8)
173 add %rax, %r9
174 adc %r8, %rdx
175 mov %r9, 8(rp,n,8)
176 add %r8, %rdx
177 L(ret): mov %rdx, %rax C return value is the high limb
C Restore the saved registers in reverse order of the pushes at entry.
179 pop %rbx
180 IFDOS(``pop %rdi '')
181 IFDOS(``pop %rsi '')
C Return to the caller explicitly. EPILOGUE is not assumed to emit a return,
C and without this instruction control would run past the end of the
C function, on the L(b1) single-limb path as well.
182 ret
183 EPILOGUE()