beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / x86_64 / addmul_2.asm
blob18307d719fcc245b6ed8ff8eee1e8dc36e3646f6
1 dnl AMD64 mpn_addmul_2 -- Multiply an n-limb vector with a 2-limb vector and
2 dnl add the result to a third limb vector.
4 dnl Copyright 2008, 2011, 2012 Free Software Foundation, Inc.
6 dnl This file is part of the GNU MP Library.
7 dnl
8 dnl The GNU MP Library is free software; you can redistribute it and/or modify
9 dnl it under the terms of either:
10 dnl
11 dnl * the GNU Lesser General Public License as published by the Free
12 dnl Software Foundation; either version 3 of the License, or (at your
13 dnl option) any later version.
14 dnl
15 dnl or
16 dnl
17 dnl * the GNU General Public License as published by the Free Software
18 dnl Foundation; either version 2 of the License, or (at your option) any
19 dnl later version.
20 dnl
21 dnl or both in parallel, as here.
22 dnl
23 dnl The GNU MP Library is distributed in the hope that it will be useful, but
24 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
25 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
26 dnl for more details.
27 dnl
28 dnl You should have received copies of the GNU General Public License and the
29 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
30 dnl see https://www.gnu.org/licenses/.
32 include(`../config.m4')
34 C cycles/limb
35 C AMD K8,K9 2.375
36 C AMD K10 2.375
37 C Intel P4 15-16
38 C Intel core2 4.45
39 C Intel NHM 4.32
40 C Intel SBR 3.4
41 C Intel atom ?
42 C VIA nano 4.4
44 C This code is the result of running a code generation and optimization tool
45 C suite written by David Harvey and Torbjorn Granlund.
47 C TODO
48 C * Tune feed-in and wind-down code.
50 C INPUT PARAMETERS
C Incoming arguments, held in the first four System V AMD64 integer argument
C registers.
define(`rp',      `%rdi')	C vector that is read and added into
define(`up',      `%rsi')	C n-limb source vector
define(`n_param', `%rdx')	C limb count; copied to n before the first mul
define(`vp',      `%rcx')	C 2-limb multiplier vector

C The two multiplier limbs, loaded from vp on entry.
define(`v0',      `%r8')
define(`v1',      `%r9')

C Four-register rotating window for partial sums and carries.  w1 shares
C %rcx with vp; vp is only read while loading v0 and v1, before w1 is first
C written.
define(`w0',      `%rbx')
define(`w1',      `%rcx')
define(`w2',      `%rbp')
define(`w3',      `%r10')

C Running limb index; negated after entry and counted up towards zero.
define(`n',       `%r11')

C ABIs this file is declared to support (macro defined outside this file).
ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)
C ---------------------------------------------------------------------------
C mpn_addmul_2 (rp, up, n, vp)
C
C Adds the product of the n-limb vector at up and the 2-limb vector at vp to
C the n-limb vector at rp.  The low n limbs of the sum go back to rp[0..n-1],
C the next limb is stored (not added) at rp[n], and the most significant limb
C is returned in %rax.
C
C Register use: %rbx and %rbp are saved on the stack and restored before
C returning; %rax, %rcx, %rdx, %rsi, %rdi and %r8-%r11 are overwritten.
C
C Structure: the main loop is unrolled four ways.  n mod 4 selects one of
C four feed-in sequences; each seeds the carry window w0-w3 with up[0]*v0,
C biases the negative index n to a multiple of 4, and jumps to the matching
C entry point inside the loop.  The last limb of up is always handled by the
C wind-down code at L(end).
C
C NOTE(review): up[0] is read unconditionally, and the n mod 4 = 1 feed-in
C enters the loop at L(lo1) and passes through three more limbs before the
C wind-down, so n = 1 (and n = 0) look unsupported.  Callers presumably
C guarantee n >= 2 -- confirm.
C ---------------------------------------------------------------------------

ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(mpn_addmul_2)
	FUNC_ENTRY(4)
	mov	n_param, n		C keep n; mul overwrites %rdx
	push	%rbx			C w0 lives in callee-saved %rbx
	push	%rbp			C w2 lives in callee-saved %rbp

	mov	0(vp), v0		C last uses of vp; %rcx becomes w1
	mov	8(vp), v1

	mov	R32(n_param), R32(%rbx)
	mov	(up), %rax		C up[0]
	lea	-8(up,n_param,8), up	C up -> last source limb
	lea	-8(rp,n_param,8), rp	C rp -> rp[n-1]
	mul	v0			C %rdx:%rax = up[0] * v0
	neg	n			C index counts up towards zero
	and	$3, R32(%rbx)		C n mod 4 picks the feed-in
	jz	L(b0)
	cmp	$2, R32(%rbx)
	jc	L(b1)			C below 2: n mod 4 = 1
	jz	L(b2)			C equal:   n mod 4 = 2
					C else fall through: n mod 4 = 3

L(b3):	mov	%rax, w1
	mov	%rdx, w2
	xor	R32(w3), R32(w3)
	mov	8(up,n,8), %rax		C reload up[0] for the v1 multiply
	dec	n
	jmp	L(lo3)

L(b2):	mov	%rax, w2
	mov	8(up,n,8), %rax		C reload up[0] for the v1 multiply
	mov	%rdx, w3
	xor	R32(w0), R32(w0)
	add	$-2, n
	jmp	L(lo2)

L(b1):	mov	%rax, w3
	mov	8(up,n,8), %rax		C reload up[0] for the v1 multiply
	mov	%rdx, w0
	xor	R32(w1), R32(w1)
	inc	n
	jmp	L(lo1)

L(b0):	mov	$0, R32(w3)
	mov	%rax, w0
	mov	8(up,n,8), %rax		C reload up[0] for the v1 multiply
	mov	%rdx, w1
	xor	R32(w2), R32(w2)
	jmp	L(lo0)

C Main loop.  Each quarter handles one source limb: limb * v0 is added into
C the window, the lowest window register is added to the matching rp limb,
C and limb * v1 joins the same carry chain one position higher.  The window
C registers rotate w3 -> w0 -> w1 -> w2 from quarter to quarter.
	ALIGN(32)
L(top):	mov	$0, R32(w1)
	mul	v0
	add	%rax, w3
	mov	(up,n,8), %rax
	adc	%rdx, w0
	adc	$0, R32(w1)
L(lo1):	mul	v1
	add	w3, (rp,n,8)
	mov	$0, R32(w3)		C mov, not xor: CF must reach the adc
	adc	%rax, w0
	mov	$0, R32(w2)		C likewise leaves CF intact
	mov	8(up,n,8), %rax
	adc	%rdx, w1
	mul	v0
	add	%rax, w0
	mov	8(up,n,8), %rax
	adc	%rdx, w1
	adc	$0, R32(w2)
L(lo0):	mul	v1
	add	w0, 8(rp,n,8)
	adc	%rax, w1
	adc	%rdx, w2
	mov	16(up,n,8), %rax
	mul	v0
	add	%rax, w1
	adc	%rdx, w2
	adc	$0, R32(w3)
	mov	16(up,n,8), %rax
L(lo3):	mul	v1
	add	w1, 16(rp,n,8)
	adc	%rax, w2
	adc	%rdx, w3
	xor	R32(w0), R32(w0)
	mov	24(up,n,8), %rax
	mul	v0
	add	%rax, w2
	mov	24(up,n,8), %rax
	adc	%rdx, w3
	adc	$0, R32(w0)
L(lo2):	mul	v1
	add	w2, 24(rp,n,8)
	adc	%rax, w3
	adc	%rdx, w0
	mov	32(up,n,8), %rax	C next limb, fetched before n moves
	add	$4, n
	js	L(top)			C loop while the index is negative

C Wind-down: n is zero here and %rax holds the last source limb.
L(end):	xor	R32(w1), R32(w1)
	mul	v0
	add	%rax, w3
	mov	(up), %rax		C last source limb again, for v1
	adc	%rdx, w0
	adc	R32(w1), R32(w1)	C w1 = carry out
	mul	v1
	add	w3, (rp)		C rp[n-1]
	adc	%rax, w0
	adc	%rdx, w1
	mov	w0, 8(rp)		C rp[n] is stored, not accumulated
	mov	w1, %rax		C return the most significant limb

	pop	%rbp
	pop	%rbx
	FUNC_EXIT()
	C FUNC_EXIT only undoes what FUNC_ENTRY set up; it does not return, so
	C the explicit ret is required to keep execution inside this function.
	ret
EPILOGUE()