beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / x86_64 / coreisbr / aorsmul_1.asm
blob9f01d9c061224673a7bcc52d13bf9e99be5376e5
1 dnl X86-64 mpn_addmul_1 and mpn_submul_1 optimised for Intel Sandy Bridge.
3 dnl Contributed to the GNU project by Torbjörn Granlund.
5 dnl Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc.
7 dnl This file is part of the GNU MP Library.
8 dnl
9 dnl The GNU MP Library is free software; you can redistribute it and/or modify
10 dnl it under the terms of either:
11 dnl
12 dnl * the GNU Lesser General Public License as published by the Free
13 dnl Software Foundation; either version 3 of the License, or (at your
14 dnl option) any later version.
15 dnl
16 dnl or
17 dnl
18 dnl * the GNU General Public License as published by the Free Software
19 dnl Foundation; either version 2 of the License, or (at your option) any
20 dnl later version.
21 dnl
22 dnl or both in parallel, as here.
23 dnl
24 dnl The GNU MP Library is distributed in the hope that it will be useful, but
25 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
26 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
27 dnl for more details.
28 dnl
29 dnl You should have received copies of the GNU General Public License and the
30 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
31 dnl see https://www.gnu.org/licenses/.
33 include(`../config.m4')
35 C cycles/limb
36 C AMD K8,K9
37 C AMD K10
38 C AMD bull
39 C AMD pile
40 C AMD steam
41 C AMD bobcat
42 C AMD jaguar
43 C Intel P4
44 C Intel core
45 C Intel NHM
46 C Intel SBR 3.24 (average, fluctuating in 3.20-3.57)
47 C Intel IBR 3.04
48 C Intel HWL
49 C Intel BWL
50 C Intel atom
51 C VIA nano
53 C The loop of this code is the result of running a code generation and
54 C optimization tool suite written by David Harvey and Torbjörn Granlund.
55 C Argument registers under their STD64 names. The trailing note on each line is the register that carries the same argument under DOS64; up arrives in rdx there and is moved to %rsi at function entry.
56 define(`rp', `%rdi') C rcx
57 define(`up', `%rsi') C rdx
58 define(`n_param', `%rdx') C r8
59 define(`v0', `%rcx') C r9
60 C Loop index register. %rbx is callee-saved, so the function pushes it on entry and pops it on each exit path.
61 define(`n', `%rbx')
62 C I(a,b) expands to its first argument only; the wind-down code uses it to pick the non-indexed addressing form.
63 define(`I',`$1')
64 C Pick the limb instruction ADDSUB and the exported name func according to whichever OPERATION_* symbol is defined.
65 ifdef(`OPERATION_addmul_1',`
66 define(`ADDSUB', `add')
67 define(`func', `mpn_addmul_1')
68 ')
69 ifdef(`OPERATION_submul_1',`
70 define(`ADDSUB', `sub')
71 define(`func', `mpn_submul_1')
72 ')
73 C NOTE(review): ABI_SUPPORT is not defined in this file; presumably it declares which calling conventions this code handles - confirm in config.m4.
74 ABI_SUPPORT(DOS64)
75 ABI_SUPPORT(STD64)
76 C NOTE(review): MULFUNC_PROLOGUE is not defined in this file; presumably it lists the entry points that can be built from this one source - confirm in config.m4.
77 MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)
78 C DOS64 remapping: up, rp, v0 and n_param are rebound to %rsi, %rcx, %r9 and %r8, and the bare word r9 is rewritten to rdi. NOTE(review): the extra quoting level appears to keep v0 as %r9 while scratch uses of %r9 in the body become %rdi - confirm against the m4 output.
79 IFDOS(` define(`up', ``%rsi'')') dnl
80 IFDOS(` define(`rp', ``%rcx'')') dnl
81 IFDOS(` define(`v0', ``%r9'')') dnl
82 IFDOS(` define(`r9', ``rdi'')') dnl
83 IFDOS(` define(`n_param',``%r8'')') dnl
85 ASM_START()
86 TEXT
87 ALIGN(32)
88 PROLOGUE(func)
89 C func(rp, up, n_param, v0): for each of the n_param limbs, rp limb = rp limb ADDSUB up limb * v0 with carry/borrow propagated upward; the final carry/borrow limb is returned in %rax.
90 IFDOS(``push %rsi '')
91 IFDOS(``push %rdi '')
92 IFDOS(``mov %rdx, %rsi '')
93 C The IFDOS lines above (presumably emitted only for DOS64) save %rsi and %rdi and move the source pointer out of %rdx, which mul overwrites.
94 mov (up), %rax C rax = first source limb
95 push %rbx C n lives in callee-saved %rbx
96 lea (up,n_param,8), up C up = one past the last source limb
97 lea (rp,n_param,8), rp C rp = one past the last destination limb
98 C Dispatch on the two low bits of n_param to enter the 4-way unrolled loop at the matching stage; n becomes a negative limb index relative to the array ends.
99 test $1, R8(n_param)
100 jnz L(b13) C odd limb count
101 C Even limb count: the carry-in held in %r11 starts at zero.
102 L(b02): xor R32(%r11), R32(%r11)
103 test $2, R8(n_param)
104 jnz L(b2)
105 C Bit 1 clear, bit 0 clear: n = 1 - n_param, enter at L(e0).
106 L(b0): mov $1, R32(n)
107 sub n_param, n
108 mul v0 C rdx:rax = first limb * v0
109 mov %rdx, %r9 C high half
110 mov -8(rp,n,8), %r8 C first destination limb
111 jmp L(e0)
112 C Bit 1 set, bit 0 clear: n = -1 - n_param, enter at L(e2).
113 ALIGN(16)
114 L(b2): mov $-1, n
115 sub n_param, n
116 mul v0 C rdx:rax = first limb * v0
117 mov 8(rp,n,8), %r8 C first destination limb
118 mov %rdx, %r9 C high half
119 jmp L(e2)
120 C Odd limb count: the carry-in held in %r9 starts at zero.
121 ALIGN(16)
122 L(b13): xor R32(%r9), R32(%r9)
123 test $2, R8(n_param)
124 jnz L(b3)
125 C Bit 1 clear, bit 0 set: n = 2 - n_param; a non-negative result means a single limb, handled at L(1), otherwise enter at L(e1).
126 L(b1): mov $2, R32(n)
127 sub n_param, n
128 jns L(1)
129 mul v0 C rdx:rax = first limb * v0
130 mov -16(rp,n,8), %r10 C first destination limb
131 mov %rdx, %r11 C high half
132 jmp L(e1)
133 C Bit 1 set, bit 0 set: n = -n_param, enter at L(e3), which copies the high half from %rdx itself.
134 ALIGN(16)
135 L(b3): xor R32(n), R32(n)
136 sub n_param, n
137 mul v0 C rdx:rax = first limb * v0
138 mov (rp,n,8), %r10 C first destination limb
139 jmp L(e3)
140 C Main loop, four limbs per pass. %r8 and %r10 alternate as the destination limb being updated; %r9 and %r11 alternate as the high product half that collects the carry/borrow of the limb paired with it (%r8 with %r9, %r10 with %r11).
141 ALIGN(32)
142 L(top): mul v0 C rdx:rax = source limb * v0
143 mov -16(rp,n,8), %r10 C fetch next destination limb
144 ADDSUB %r11, %r8 C apply the previous high half
145 mov %rdx, %r11 C new high half
146 adc $0, %r9 C fold carry/borrow into the high half paired with %r8
147 mov %r8, -24(rp,n,8) C store finished limb
148 L(e1): ADDSUB %rax, %r10 C apply low product half
149 mov -8(up,n,8), %rax C fetch next source limb
150 adc $0, %r11
151 mul v0
152 ADDSUB %r9, %r10 C apply the previous high half
153 mov %rdx, %r9
154 mov -8(rp,n,8), %r8
155 adc $0, %r11
156 mov %r10, -16(rp,n,8) C store finished limb
157 L(e0): ADDSUB %rax, %r8 C apply low product half
158 adc $0, %r9
159 mov (up,n,8), %rax C fetch next source limb
160 mul v0
161 mov (rp,n,8), %r10
162 ADDSUB %r11, %r8 C apply the previous high half
163 mov %r8, -8(rp,n,8) C store finished limb
164 adc $0, %r9
165 L(e3): mov %rdx, %r11
166 ADDSUB %rax, %r10 C apply low product half
167 mov 8(up,n,8), %rax C fetch next source limb
168 adc $0, %r11
169 mul v0
170 mov 8(rp,n,8), %r8
171 ADDSUB %r9, %r10 C apply the previous high half
172 mov %rdx, %r9
173 mov %r10, (rp,n,8) C store finished limb
174 adc $0, %r11
175 L(e2): ADDSUB %rax, %r8 C apply low product half
176 adc $0, %r9
177 mov 16(up,n,8), %rax C fetch the source limb for the next mul
178 add $4, n C advance four limbs
179 jnc L(top) C keep looping until the index add carries out
180 C Wind-down: finish the last two limbs. I(a,b) is defined as its first argument, so the non-indexed -8(rp) and -16(rp) forms are the ones assembled.
181 L(end): mul v0 C product for the last limb
182 mov I(-8(rp),-16(rp,n,8)), %r10 C last destination limb
183 ADDSUB %r11, %r8
184 mov %rdx, %r11
185 adc $0, %r9
186 mov %r8, I(-16(rp),-24(rp,n,8)) C store second-to-last limb
187 ADDSUB %rax, %r10
188 adc $0, %r11
189 ADDSUB %r9, %r10
190 adc $0, %r11
191 mov %r10, I(-8(rp),-16(rp,n,8)) C store last limb
192 mov %r11, %rax C result = final carry/borrow limb
193 C Restore saved registers in reverse push order and return.
194 pop %rbx
195 IFDOS(``pop %rdi '')
196 IFDOS(``pop %rsi '')
197 ret C return here; the L(1) code below is a separate exit path
198 C Single-limb case, reached from L(b1) via jns. rp was advanced by one limb, so -8(rp) is the only destination limb.
199 ALIGN(16)
200 L(1): mul v0 C rdx:rax = the single limb * v0
201 ADDSUB %rax, -8(rp) C update the destination limb in memory
202 mov %rdx, %rax
203 adc $0, %rax C result = high half plus carry/borrow
204 pop %rbx
205 IFDOS(``pop %rdi '')
206 IFDOS(``pop %rsi '')
207 ret
208 EPILOGUE()
209 ASM_END()