beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / x86_64 / aorsmul_1.asm
blobe3fc005757f328bdf6d4b317d5d06924eb9701f3
1 dnl AMD64 mpn_addmul_1 and mpn_submul_1.
3 dnl Copyright 2003-2005, 2007, 2008, 2011, 2012 Free Software Foundation, Inc.
5 dnl This file is part of the GNU MP Library.
6 dnl
7 dnl The GNU MP Library is free software; you can redistribute it and/or modify
8 dnl it under the terms of either:
9 dnl
10 dnl * the GNU Lesser General Public License as published by the Free
11 dnl Software Foundation; either version 3 of the License, or (at your
12 dnl option) any later version.
13 dnl
14 dnl or
15 dnl
16 dnl * the GNU General Public License as published by the Free Software
17 dnl Foundation; either version 2 of the License, or (at your option) any
18 dnl later version.
19 dnl
20 dnl or both in parallel, as here.
21 dnl
22 dnl The GNU MP Library is distributed in the hope that it will be useful, but
23 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
25 dnl for more details.
26 dnl
27 dnl You should have received copies of the GNU General Public License and the
28 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
29 dnl see https://www.gnu.org/licenses/.
31 include(`../config.m4')
33 C cycles/limb
34 C AMD K8,K9 2.5
35 C AMD K10 2.5
36 C AMD bd1 5.0
37 C AMD bobcat 6.17
38 C Intel P4 14.9
39 C Intel core2 5.09
40 C Intel NHM 4.9
41 C Intel SBR 4.0
42 C Intel atom 21.3
43 C VIA nano 5.0
45 C The loop of this code is the result of running a code generation and
46 C optimization tool suite written by David Harvey and Torbjorn Granlund.
48 C TODO
49 C * The loop is great, but the prologue and epilogue code was quickly written.
50 C Tune it!
C Operand registers as they arrive under STD64.  The trailing comment on
C each line names the register that carries the same argument under DOS64.
52 define(`rp', `%rdi') C rcx
53 define(`up', `%rsi') C rdx
54 define(`n_param', `%rdx') C r8
55 define(`vl', `%rcx') C r9
C Working copy of the limb count.  The count cannot stay in rdx because mul
C writes rdx, so the prologue moves it here.
57 define(`n', `%r11')
C Build-time selection of the operation.  Whichever OPERATION_ symbol is
C defined fixes both the accumulate instruction (ADDSUB) and the exported
C function name (func).  Presumably the build defines exactly one of them.
59 ifdef(`OPERATION_addmul_1',`
60 define(`ADDSUB', `add')
61 define(`func', `mpn_addmul_1')
62 ')
63 ifdef(`OPERATION_submul_1',`
64 define(`ADDSUB', `sub')
65 define(`func', `mpn_submul_1')
66 ')
C NOTE(review): ABI_SUPPORT and MULFUNC_PROLOGUE come from the included m4
C definitions.  Presumably they announce the ABIs this file handles and the
C two entry points it can be built as -- confirm in the m4 sources.
68 ABI_SUPPORT(DOS64)
69 ABI_SUPPORT(STD64)
71 MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)
C DOS64 register rebinding.  rp, vl and n are used straight from their
C incoming registers (rcx, r9, r8); up is rsi, which the prologue loads from
C rdx.  Since r9 and r8 are then occupied, the bare words r9 and r8 are
C redefined so that scratch uses written as %r9 and %r8 in the body become
C %rdi and %r11.  The doubled quotes keep the register names held in vl and
C n from being rewritten by those two definitions.
73 IFDOS(` define(`up', ``%rsi'') ') dnl
74 IFDOS(` define(`rp', ``%rcx'') ') dnl
75 IFDOS(` define(`vl', ``%r9'') ') dnl
76 IFDOS(` define(`r9', ``rdi'') ') dnl
77 IFDOS(` define(`n', ``%r8'') ') dnl
78 IFDOS(` define(`r8', ``r11'') ') dnl
C Begin the code section; the entry point is aligned to 16 bytes.
80 ASM_START()
81 TEXT
82 ALIGN(16)
C ---------------------------------------------------------------------------
C func(rp, up, n, vl) -- built as mpn_addmul_1 or mpn_submul_1.
C
C Multiplies each limb read through up by vl and applies the running product
C to the limbs at rp with ADDSUB (add or sub).  The final high limb, with the
C last carry or borrow folded in, is returned in rax.
C
C STD64 in: rp = rdi, up = rsi, n = rdx, vl = rcx.
C DOS64 in: rp = rcx, up = rdx, n = r8, vl = r9.  up is copied to rsi, and
C           rsi and rdi are saved because a literal r9 in the body is
C           remapped to rdi (and a literal r8 to r11).
C rbx is saved and restored under both ABIs.
C
C The loop is unrolled four ways.  n mod 4 picks the entry block (b0..b3).
C Each entry block biases up and rp and negates n, so the index counts
C upward and the loop ends once adding 4 makes it non-negative.
C r10, r9, r8 and rbx take turns holding product limbs that are still
C waiting to be applied to rp.
C NOTE(review): the first limb is read before n is examined, so n >= 1 looks
C like a precondition -- confirm against callers.
C ---------------------------------------------------------------------------
83 PROLOGUE(func)
85 IFDOS(``push %rsi '') C DOS64: rsi is callee-saved, used below for up
86 IFDOS(``push %rdi '') C DOS64: rdi is callee-saved, stands in for r9
87 IFDOS(``mov %rdx, %rsi '') C DOS64: up arrives in rdx, which mul overwrites
89 mov (up), %rax C read first u limb early
90 push %rbx C rbx is callee-saved
91 IFSTD(` mov n_param, %rbx ') C move away n from rdx, mul uses it
92 IFDOS(` mov n, %rbx ') C DOS64: n stays in r8, rbx gets a copy
93 mul vl C rdx:rax = first u limb * vl
94 IFSTD(` mov %rbx, n ') C STD64: working count now in r11
96 and $3, R32(%rbx) C rbx = n mod 4, selects the loop entry
97 jz L(b0)
98 cmp $2, R32(%rbx)
99 jz L(b2)
100 jg L(b3) C 3 goes to b3, 1 falls through to b1
C n mod 4 = 1
102 L(b1): dec n
103 jne L(gt1) C more than one limb: enter the loop
104 ADDSUB %rax, (rp) C single limb: apply the low product half
105 jmp L(ret) C CF from ADDSUB is folded into rdx at ret
106 L(gt1): lea 8(up,n,8), up
107 lea -8(rp,n,8), rp
108 neg n
109 xor %r10, %r10
110 xor R32(%rbx), R32(%rbx)
111 mov %rax, %r9 C low half of first product
112 mov (up,n,8), %rax C next u limb
113 mov %rdx, %r8 C high half of first product
114 jmp L(L1)
C n mod 4 = 0
116 L(b0): lea (up,n,8), up
117 lea -16(rp,n,8), rp
118 neg n
119 xor %r10, %r10
120 mov %rax, %r8 C low half of first product
121 mov %rdx, %rbx C high half of first product
122 jmp L(L0)
C n mod 4 = 3
124 L(b3): lea -8(up,n,8), up
125 lea -24(rp,n,8), rp
126 neg n
127 mov %rax, %rbx C low half of first product
128 mov %rdx, %r10 C high half of first product
129 jmp L(L3)
C n mod 4 = 2
131 L(b2): lea -16(up,n,8), up
132 lea -32(rp,n,8), rp
133 neg n
134 xor %r8, %r8 C r8 must be zero at the wind-down adc
135 xor R32(%rbx), R32(%rbx)
136 mov %rax, %r10 C low half of first product
137 mov 24(up,n,8), %rax C next u limb
138 mov %rdx, %r9 C high half of first product
139 jmp L(L2)
C Main loop, four limbs per pass.  Each ADDSUB applies a finished limb to rp
C and the adc pair that follows feeds its carry or borrow, plus the newest
C product, into the next two pending limbs.
141 ALIGN(16)
142 L(top): ADDSUB %r10, (rp,n,8)
143 adc %rax, %r9
144 mov (up,n,8), %rax
145 adc %rdx, %r8
146 mov $0, R32(%r10) C 32-bit write clears all of r10
147 L(L1): mul vl
148 ADDSUB %r9, 8(rp,n,8)
149 adc %rax, %r8
150 adc %rdx, %rbx
151 L(L0): mov 8(up,n,8), %rax
152 mul vl
153 ADDSUB %r8, 16(rp,n,8)
154 adc %rax, %rbx
155 adc %rdx, %r10
156 L(L3): mov 16(up,n,8), %rax
157 mul vl
158 ADDSUB %rbx, 24(rp,n,8)
159 mov $0, R32(%r8) C zero; mov keeps CF from ADDSUB for the adc below
160 mov %r8, %rbx C zero
161 adc %rax, %r10
162 mov 24(up,n,8), %rax
163 mov %r8, %r9 C zero
164 adc %rdx, %r9
165 L(L2): mul vl
166 add $4, n C advance four limbs
167 js L(top) C loop while the index is still negative
C Wind-down: apply the last two pending limbs.
169 ADDSUB %r10, (rp,n,8)
170 adc %rax, %r9
171 adc %r8, %rdx C r8 is zero here, so this only adds CF
172 ADDSUB %r9, 8(rp,n,8)
173 L(ret): adc $0, %rdx C fold the last carry or borrow into the high limb
174 mov %rdx, %rax C return value
176 pop %rbx
177 IFDOS(``pop %rdi '')
178 IFDOS(``pop %rsi '')
179 ret
180 EPILOGUE()