beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / x86_64 / bd1 / aorsmul_1.asm
blob96fec9f5ac51dff8707d2f086310da20f44489c5
1 dnl AMD64 mpn_addmul_1 and mpn_submul_1 optimised for AMD Bulldozer.
3 dnl Copyright 2003-2005, 2007, 2008, 2011, 2012 Free Software Foundation, Inc.
5 dnl This file is part of the GNU MP Library.
6 dnl
7 dnl The GNU MP Library is free software; you can redistribute it and/or modify
8 dnl it under the terms of either:
9 dnl
10 dnl * the GNU Lesser General Public License as published by the Free
11 dnl Software Foundation; either version 3 of the License, or (at your
12 dnl option) any later version.
13 dnl
14 dnl or
15 dnl
16 dnl * the GNU General Public License as published by the Free Software
17 dnl Foundation; either version 2 of the License, or (at your option) any
18 dnl later version.
19 dnl
20 dnl or both in parallel, as here.
21 dnl
22 dnl The GNU MP Library is distributed in the hope that it will be useful, but
23 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
25 dnl for more details.
26 dnl
27 dnl You should have received copies of the GNU General Public License and the
28 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
29 dnl see https://www.gnu.org/licenses/.
31 include(`../config.m4')
33 C cycles/limb
34 C AMD K8,K9
35 C AMD K10
36 C AMD bd1 4.5-4.7
37 C AMD bobcat
38 C Intel P4
39 C Intel core2
40 C Intel NHM
41 C Intel SBR
42 C Intel atom
43 C VIA nano
45 C The loop of this code is the result of running a code generation and
46 C optimisation tool suite written by David Harvey and Torbjorn Granlund.
48 C TODO
49 C * Try to make loop run closer to 4 c/l.
C Symbolic names for the argument registers.
C
C   name      default   DOS64 (see the IFDOS overrides further down)
C   rp        rdi       rcx
C   up        rsi       rdx on entry, copied to rsi
C   n_param   rdx       r8
C   v0        rcx       r9
C
C n is the working copy of the limb count; it is kept out of rdx because
C mul writes rdx.
define(`rp',      `%rdi')
define(`up',      `%rsi')
define(`n_param', `%rdx')
define(`v0',      `%rcx')

define(`n',       `%r11')
C Operation selection.  Whichever OPERATION_* symbol is defined decides the
C exported function name (func) and the instruction (ADDSUB) that combines
C each product limb with the destination limb.  Presumably exactly one of
C the two symbols is supplied by the build; confirm against the build rules.
C
C Each ifdef body is a quoted second argument and has to be closed with a
C quote-paren pair on its own line, otherwise m4 keeps collecting the rest
C of the file as part of that argument.
ifdef(`OPERATION_addmul_1',`
      define(`ADDSUB',        `add')
      define(`func',  `mpn_addmul_1')
')
ifdef(`OPERATION_submul_1',`
      define(`ADDSUB',        `sub')
      define(`func',  `mpn_submul_1')
')
ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)

C DOS64 register remapping.
C
C Argument names: rp, v0 and n are pointed at the registers where DOS64
C delivers them.  up stays rsi; the entry code copies rdx into it.
C The inner double quoting keeps these register names literal, so that they
C are not rewritten by the r8/r9 aliases defined just below.
IFDOS(`	define(`up', ``%rsi'')	') dnl
IFDOS(`	define(`rp', ``%rcx'')	') dnl
IFDOS(`	define(`v0', ``%r9'')	') dnl
IFDOS(`	define(`n',  ``%r8'')	') dnl
C Scratch renames: r8 and r9 now hold n and v0, so every plain use of r8/r9
C in the function body is redirected to r11/rdi instead.
IFDOS(`	define(`r9', ``rdi'')	') dnl
IFDOS(`	define(`r8', ``r11'')	') dnl
C func(rp, up, n, v0)  --  mpn_addmul_1 or mpn_submul_1, chosen by ADDSUB.
C
C Every limb read from up is multiplied by v0 and the running result is
C combined (ADDSUB) into the limb at the same index of rp.  The last
C carry/borrow limb is returned in rax.
C
C The code loads up[0] unconditionally and has no path for n = 0, so callers
C must pass n >= 1.
C
C Register use once the entry sequence has run:
C   rp     biased to rp + 8*n - 16; (rp) and 8(rp) are the last two limbs
C   up     biased to up + 8*n (done in each L(bX) block)
C   n      negative index, stepped by 4 per loop iteration up to zero
C   rbx, r8, r9, r10
C          rotating accumulators for product words not yet written back;
C          rbx is callee-saved, hence the push/pop pair
C   rax, rdx
C          mul operand and result
C
C The loop is unrolled four ways.  n mod 4 selects one of the L(b0)..L(b3)
C feed-in blocks, each of which primes the accumulators from the first
C product and enters the loop at the matching L(Lx) label (or at L(top)).
ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(func)
C DOS64 only: rsi and rdi are callee-saved there, and up arrives in rdx.
IFDOS(``push	%rsi		'')
IFDOS(``push	%rdi		'')
IFDOS(``mov	%rdx, %rsi	'')

	mov	(up), %rax		C read first u limb early
	push	%rbx
IFSTD(`	mov	n_param, %rbx	')	C move away n from rdx, mul uses it
IFDOS(`	mov	n, %rbx		')
	mul	v0			C rdx:rax = up[0] * v0

IFSTD(`	mov	%rbx, n		')

	and	$3, R32(%rbx)		C rbx = n mod 4
	lea	-16(rp,n,8), rp		C lea keeps the flags set by the and
	jz	L(b0)
	cmp	$2, R32(%rbx)
	jb	L(b1)
	jz	L(b2)

C n mod 4 = 3
L(b3):	mov	$0, R32(%r8)
	mov	%rax, %rbx
	mov	$0, R32(%r9)
	mov	8(up), %rax
	mov	%rdx, %r10
	lea	(up,n,8), up
	not	n			C n = -n - 1
	jmp	L(L3)

C n mod 4 = 0
L(b0):	mov	$0, R32(%r10)
	mov	%rax, %r8
	mov	%rdx, %rbx
	mov	8(up), %rax
	lea	(up,n,8), up
	neg	n
	jmp	L(L0)

C n mod 4 = 1; a single limb needs only the first product
L(b1):	cmp	$1, n
	jz	L(n1)
	mov	%rax, %r9
	mov	8(up), %rax
	mov	%rdx, %r8
	mov	$0, R32(%rbx)
	lea	(up,n,8), up
	neg	n
	inc	n
	jmp	L(L1)

C n mod 4 = 2; with exactly two limbs the loop is skipped
L(b2):	mov	$0, R32(%rbx)
	mov	%rax, %r10
	mov	%rdx, %r9
	mov	8(up), %rax
	mov	$0, R32(%r8)
	lea	(up,n,8), up
	neg	n
	add	$2, n
	jns	L(end)

C Main loop.  Accumulators are cleared with mov rather than xor because mov
C leaves the flags alone: the carry produced by ADDSUB has to survive until
C the adc instructions that follow it.
	ALIGN(32)
L(top):	mul	v0
	ADDSUB	%r10, (rp,n,8)
	adc	%rax, %r9
	mov	(up,n,8), %rax
	adc	%rdx, %r8
L(L1):	mul	v0
	mov	$0, R32(%r10)
	ADDSUB	%r9, 8(rp,n,8)
	adc	%rax, %r8
	adc	%rdx, %rbx
	mov	8(up,n,8), %rax
L(L0):	mul	v0
	ADDSUB	%r8, 16(rp,n,8)
	mov	$0, R32(%r8)
	adc	%rax, %rbx
	mov	$0, R32(%r9)
	mov	16(up,n,8), %rax
	adc	%rdx, %r10
L(L3):	mul	v0
	ADDSUB	%rbx, 24(rp,n,8)
	mov	$0, R32(%rbx)
	adc	%rax, %r10
	adc	%rdx, %r9
	mov	24(up,n,8), %rax
	add	$4, n
	js	L(top)

C Wind-down: last product, then write back the final two limbs.
L(end):	mul	v0
	ADDSUB	%r10, (rp)
	adc	%r9, %rax
	adc	%r8, %rdx
L(n1):	ADDSUB	%rax, 8(rp)
	adc	$0, %rdx
	mov	%rdx, %rax		C return value: final carry/borrow limb

	pop	%rbx
IFDOS(``pop	%rdi		'')
IFDOS(``pop	%rsi		'')
	ret				C without this, control runs off the end
EPILOGUE()
ASM_END()