dnl  beta-0.89.2
dnl  [luatex.git] / source / libs / gmp / gmp-src / mpn / x86_64 / coreinhm / aorsmul_1.asm
dnl  blob b768905b93921b4d33a1b2cdb06174d43aacdbf3
dnl  AMD64 mpn_addmul_1 and mpn_submul_1 optimised for Intel Nehalem.

dnl  Contributed to the GNU project by Torbjörn Granlund.

dnl  Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc.
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.
dnl  Pull in the shared m4 configuration.  The macros used below (C, L, R32,
dnl  PROLOGUE, ...) are not defined in this file, so they presumably come from
dnl  here -- TODO confirm.
include(`../config.m4')
C	     cycles/limb
C AMD K8,K9
C AMD K10
C AMD bull
C AMD pile
C AMD bobcat
C AMD jaguar
C Intel P4
C Intel core
C Intel NHM	 4.55		with minor fluctuations
C Intel SBR
C Intel IBR
C Intel HWL
C Intel BWL
C Intel atom
C VIA nano
C The loop of this code is the result of running a code generation and
C optimization tool suite written by David Harvey and Torbjorn Granlund.

C N.B.: Be careful if editing, making sure the loop alignment padding does not
C become large, as we currently fall into it.
C Symbolic names for the argument registers used by the code below:
C   rp       destination limb pointer (loaded from and stored to)
C   up       source limb pointer (only loaded from)
C   n_param  limb count as passed in
C   v0       multiplier limb (operand of every mul)
C The trailing register names are the ones noted for the alternate calling
C convention (presumably DOS64 -- TODO confirm).
define(`rp',      `%rdi')   C rcx
define(`up',      `%rsi')   C rdx
define(`n_param', `%rdx')   C r8
define(`v0',      `%rcx')   C r9

C Loop index register.  It is pushed on entry and popped before returning.
define(`n',	  `%rbx')
C Select the operation when the file is preprocessed.  If OPERATION_addmul_1
C is defined, ADDSUB expands to add and func to mpn_addmul_1; if
C OPERATION_submul_1 is defined, ADDSUB expands to sub and func to
C mpn_submul_1.  The OPERATION_* symbols are not defined in this file
C (presumably supplied by the build -- TODO confirm).
C Each ifdef body is a quoted string that must be closed before the next
C top-level statement.
ifdef(`OPERATION_addmul_1',`
      define(`ADDSUB',        `add')
      define(`func',          `mpn_addmul_1')
')
ifdef(`OPERATION_submul_1',`
      define(`ADDSUB',        `sub')
      define(`func',          `mpn_submul_1')
')
C Declare the ABIs this file is written for.  ABI_SUPPORT is defined outside
C this file.
ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

C Name the two entry points this source can produce.  MULFUNC_PROLOGUE is
C defined outside this file (presumably read by the build machinery -- TODO
C confirm).
MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)
C ---------------------------------------------------------------------------
C func -- mpn_addmul_1 or mpn_submul_1, depending on the OPERATION_* choice.
C
C Multiplies the n_param limbs at up by the single limb v0 and adds the
C product to (ADDSUB = add) or subtracts it from (ADDSUB = sub) the n_param
C limbs at rp, storing the result back to rp.
C
C In:	rp	 destination limbs, read and rewritten in place
C	up	 source limbs, read only
C	n_param	 limb count; must be >= 1, since (up) and (rp) are loaded
C		 unconditionally and no zero-length path exists
C	v0	 multiplier limb
C Out:	%rax	 the final carry limb (add) or borrow limb (sub)
C Uses:	%rbx as n (saved and restored), %rax, %rdx, %r8-%r11, flags
C
C Method: rp and up are advanced to their last limb and n is a negative
C offset that counts up by 4, so the loop terminates when "add $4, n"
C carries.  The loop is unrolled four ways; the entry code dispatches on
C n_param mod 4 and jumps to the matching point in the loop.
C
C Inside each ADDSUB ... adc pair the register that receives the high
C product limb is cleared with "mov $0" rather than xor, because mov leaves
C the carry flag produced by ADDSUB intact for the following adc.
C
C FUNC_ENTRY, FUNC_EXIT, R8 and R32 are defined outside this file.  R8(x) and
C R32(x) presumably name the 8-bit and 32-bit parts of register x, and
C FUNC_ENTRY/FUNC_EXIT presumably adapt the DOS64 argument registers -- TODO
C confirm.
C ---------------------------------------------------------------------------
ASM_START()
	TEXT
	ALIGN(32)
PROLOGUE(func)
	FUNC_ENTRY(4)
	push	%rbx			C rbx backs n; popped before returning

	mov	(up), %rax		C rax = up[0], first multiplicand
	lea	-8(up,n_param,8), up	C up -> last source limb
	mov	(rp), %r8		C r8 = rp[0]
	lea	-8(rp,n_param,8), rp	C rp -> last destination limb

	test	$1, R8(n_param)		C odd limb count?
	jnz	L(bx1)

L(bx0):	test	$2, R8(n_param)		C even count: mod 4 is 0 or 2
	jnz	L(b10)

L(b00):	mov	$3, R32(n)		C n_param mod 4 == 0
	sub	n_param, n		C n = 3 - n_param
	mul	v0			C rdx:rax = up[0] * v0
	mov	$0, R32(%r11)
	mov	%r8, %r10
	ADDSUB	%rax, %r10		C combine low product limb with rp[0]
	mov	-8(up,n,8), %rax	C rax = up[1]
	adc	%rdx, %r11		C r11 = high limb + carry/borrow
	jmp	L(lo0)

L(b10):	mov	$1, R32(n)		C n_param mod 4 == 2
	sub	n_param, n		C n = 1 - n_param
	mul	v0			C rdx:rax = up[0] * v0
	mov	%r8, %r10
	mov	$0, R32(%r11)
	ADDSUB	%rax, %r10		C combine low product limb with rp[0]
	mov	8(up,n,8), %rax		C rax = up[1]
	adc	%rdx, %r11		C r11 = high limb + carry/borrow
	jmp	L(lo2)

L(bx1):	test	$2, R8(n_param)		C odd count: mod 4 is 1 or 3
	jz	L(b01)

L(b11):	mov	$2, R32(n)		C n_param mod 4 == 3
	sub	n_param, n		C n = 2 - n_param
	mul	v0			C rdx:rax = up[0] * v0
	ADDSUB	%rax, %r8		C combine low product limb with rp[0]
	mov	$0, R32(%r9)		C mov keeps CF for the adc below
	mov	(up,n,8), %rax		C rax = up[1]
	adc	%rdx, %r9		C r9 = high limb + carry/borrow
	jmp	L(lo3)

L(b01):	mov	$0, R32(n)		C n_param mod 4 == 1
	sub	n_param, n		C n = -n_param
	xor	%r11, %r11		C no carry limb pending yet
	add	$4, n			C n = 4 - n_param
	jc	L(end)			C carries only when n_param == 1
					C otherwise fall through the alignment
					C padding into the loop

	ALIGN(32)
L(top):	mul	v0
	ADDSUB	%rax, %r8
	mov	$0, R32(%r9)		C mov keeps CF for the adc below
	mov	-16(up,n,8), %rax
	adc	%rdx, %r9
C L(lo1) is not a branch target in this file; it marks the second quarter of
C the unrolled loop.
L(lo1):	mul	v0
	ADDSUB	%r11, %r8		C fold in the pending carry limb
	mov	$0, R32(%r11)
	mov	-16(rp,n,8), %r10
	adc	$0, %r9
	ADDSUB	%rax, %r10
	mov	-8(up,n,8), %rax
	adc	%rdx, %r11
	mov	%r8, -24(rp,n,8)	C store finished limb
	ADDSUB	%r9, %r10
	adc	$0, %r11
L(lo0):	mov	-8(rp,n,8), %r8
	mul	v0
	ADDSUB	%rax, %r8
	mov	$0, R32(%r9)
	mov	(up,n,8), %rax
	adc	%rdx, %r9
	mov	%r10, -16(rp,n,8)	C store finished limb
	ADDSUB	%r11, %r8
	adc	$0, %r9
L(lo3):	mul	v0
	mov	(rp,n,8), %r10
	mov	$0, R32(%r11)
	ADDSUB	%rax, %r10
	mov	8(up,n,8), %rax
	adc	%rdx, %r11
	mov	%r8, -8(rp,n,8)		C store finished limb
	ADDSUB	%r9, %r10
	adc	$0, %r11
L(lo2):	mov	8(rp,n,8), %r8
	mov	%r10, (rp,n,8)		C store finished limb
	add	$4, n			C advance four limbs
	jnc	L(top)			C loop until n wraps past zero

C Last limb: rax holds the final source limb, r8 the final destination limb
C and r11 the pending carry limb.
L(end):	mul	v0
	ADDSUB	%rax, %r8
	mov	$0, R32(%rax)		C mov keeps CF for the adc below
	adc	%rdx, %rax		C rax = high limb + carry/borrow
	ADDSUB	%r11, %r8
	adc	$0, %rax		C rax = limb returned to the caller
	mov	%r8, (rp)		C rp points at the last limb

	pop	%rbx
	FUNC_EXIT()
	ret
EPILOGUE()
ASM_END()