beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / x86_64 / coreisbr / aorrlsh_n.asm
blobdb8ee688496c79d3371ae750457e15e5ebb7a739
1 dnl AMD64 mpn_addlsh_n -- rp[] = up[] + (vp[] << k)
2 dnl AMD64 mpn_rsblsh_n -- rp[] = (vp[] << k) - up[]
3 dnl Optimised for Sandy Bridge.
5 dnl Contributed to the GNU project by Torbjorn Granlund.
7 dnl Copyright 2011, 2012 Free Software Foundation, Inc.
9 dnl This file is part of the GNU MP Library.
10 dnl
11 dnl The GNU MP Library is free software; you can redistribute it and/or modify
12 dnl it under the terms of either:
13 dnl
14 dnl * the GNU Lesser General Public License as published by the Free
15 dnl Software Foundation; either version 3 of the License, or (at your
16 dnl option) any later version.
17 dnl
18 dnl or
19 dnl
20 dnl * the GNU General Public License as published by the Free Software
21 dnl Foundation; either version 2 of the License, or (at your option) any
22 dnl later version.
23 dnl
24 dnl or both in parallel, as here.
25 dnl
26 dnl The GNU MP Library is distributed in the hope that it will be useful, but
27 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
28 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
29 dnl for more details.
30 dnl
31 dnl You should have received copies of the GNU General Public License and the
32 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
33 dnl see https://www.gnu.org/licenses/.
35 include(`../config.m4')
37 C cycles/limb
38 C AMD K8,K9 ?
39 C AMD K10 5.25
40 C Intel P4 ?
41 C Intel core2 3.1
42 C Intel NHM 3.95
43 C Intel SBR 2.75
44 C Intel atom ?
45 C VIA nano ?
47 C The inner-loop probably runs close to optimally on Sandy Bridge (using 4-way
48 C unrolling). The rest of the code is quite crude, and could perhaps be made
49 C both smaller and faster.
51 C INPUT PARAMETERS
C Symbolic names for the argument registers. The roles follow the formulas at
C the top of the file: rp[] = up[] + (vp[] << k) and rp[] = (vp[] << k) - up[].
52 define(`rp', `%rdi') C result vector
53 define(`up', `%rsi') C vector that is added or subtracted unshifted
54 define(`vp', `%rdx') C vector that is shifted left before the add or subtract
55 define(`n', `%rcx') C limb count; copied to %rax early because %rcx is reused for the shift
56 define(`cnt', `%r8') C shift count, k in the header formulas
57 define(`cy', `%r9') C for _nc variant
C Operation selection. Each ifdef body below is used only when the matching
C OPERATION_ symbol is defined; presumably the build defines exactly one of
C them per object file - confirm against the build system.
C   ADDSUB, ADCSBB  mnemonics used for the limb arithmetic: add/adc or sub/sbb
C   IFRSB           expands to its argument for rsblsh and to nothing for addlsh
C   func_n, func_nc names of the two entry points
59 ifdef(`OPERATION_addlsh_n', `
60 define(ADDSUB, add)
61 define(ADCSBB, adc)
62 define(IFRSB, )
63 define(func_n, mpn_addlsh_n)
64 define(func_nc, mpn_addlsh_nc)')
65 ifdef(`OPERATION_rsblsh_n', `
66 define(ADDSUB, sub)
67 define(ADCSBB, sbb)
68 define(IFRSB, `$1')
69 define(func_n, mpn_rsblsh_n)
70 define(func_nc, mpn_rsblsh_nc)')
C NOTE(review): ABI_SUPPORT, MULFUNC_PROLOGUE, ASM_START, TEXT and ALIGN are not
C defined in this file; presumably they come from the included config.m4 and
C declare the supported ABIs, the entry points provided here, and the start of
C the code section - confirm against the shared m4 definitions.
72 ABI_SUPPORT(DOS64)
73 ABI_SUPPORT(STD64)
75 C mpn_rsblsh_nc removed below, its idea of carry-in is inconsistent with
76 C refmpn_rsblsh_nc
77 MULFUNC_PROLOGUE(mpn_addlsh_n mpn_addlsh_nc mpn_rsblsh_n)
79 ASM_START()
80 TEXT
81 ALIGN(32)
C func_n -- for addlsh: rp[] = up[] + (vp[] << cnt)
C           for rsblsh: rp[] = (vp[] << cnt) - up[]
C
C In:   rp, up, vp, n, cnt in the registers named by the macros above.
C Out:  %rax = carry/borrow out of the top limb combined with the bits shifted
C       out of the top vp limb (addlsh: carry + bits, rsblsh: bits - borrow).
C Saves and restores %rbx and %rbp; uses %rax, %rcx, %r8-%r11 as scratch.
C NOTE(review): the code assumes n >= 1 and 0 < cnt < 64 (a zero count would
C make every shrd a no-op) - confirm against the callers.
C
C Register roles from L(ent) onwards:
C   %rax        limbs still to process, stepped down by 4 in the main loop
C   %rcx        -cnt; the shifts use its low byte, which the CPU reduces mod 64,
C               so shrd by it yields (hi << cnt) | (lo >> (64-cnt))
C   %rbx        carry saved between limb groups as 0 or -1; add %ebx,%ebx
C               turns it back into CF, sbb %ebx,%ebx stores CF into it
C   %rbp, %r8, %r9, %r10   four shifted limbs of the current group
C   %r11        most recent raw vp limb, carried into the next group
82 PROLOGUE(func_n)
83 FUNC_ENTRY(4)
84 IFDOS(` mov 56(%rsp), %r8d ') C cnt
85 push %rbx
86 xor R32(%rbx), R32(%rbx) C clear CF save register
87 L(ent): push %rbp C common entry, also reached from func_nc with %rbx preset
88 mov R32(n), R32(%rbp)
89 mov n, %rax C keep the limb count in rax, rcx is about to hold the shift
90 mov R32(cnt), R32(%rcx)
91 neg R32(%rcx) C cl = -cnt, i.e. 64-cnt once reduced mod 64
92 and $3, R32(%rbp) C n mod 4 picks the entry point into the 4-way loop
93 jz L(b0)
94 lea -32(vp,%rbp,8), vp C bias the pointers so that the n mod 4 leading limbs
95 lea -32(up,%rbp,8), up C occupy the highest offsets of a 32-byte group
96 lea -32(rp,%rbp,8), rp
97 cmp $2, R32(%rbp)
98 jc L(b1) C n mod 4 = 1
99 jz L(b2) C n mod 4 = 2
101 L(b3): xor %r8, %r8 C n mod 4 = 3; zero bits enter below the lowest limb
102 mov 8(vp), %r9
103 mov 16(vp), %r10
104 shrd R8(%rcx), %r9, %r8
105 shrd R8(%rcx), %r10, %r9
106 mov 24(vp), %r11
107 shrd R8(%rcx), %r11, %r10
108 sub $3, %rax
109 jz L(3) C n = 3: no full groups follow, finish via the wind-down code
110 add R32(%rbx), R32(%rbx) C restore CF from the save register
111 lea 32(vp), vp
112 ADCSBB 8(up), %r8
113 ADCSBB 16(up), %r9
114 ADCSBB 24(up), %r10
115 lea 32(up), up
116 jmp L(lo3)
117 L(3): add R32(%rbx), R32(%rbx) C restore CF from the save register
118 lea 32(vp), vp
119 ADCSBB 8(up), %r8
120 ADCSBB 16(up), %r9
121 ADCSBB 24(up), %r10
122 jmp L(wd3)
124 L(b0): mov (vp), %r8 C n mod 4 = 0; pointers are used unbiased
125 mov 8(vp), %r9
126 xor R32(%rbp), R32(%rbp) C zero bits enter below the lowest limb
127 jmp L(lo0)
129 L(b1): xor %r10, %r10 C n mod 4 = 1; zero bits enter below the lowest limb
130 mov 24(vp), %r11
131 shrd R8(%rcx), %r11, %r10
132 sub $1, %rax
133 jz L(1) C n = 1: no full groups follow
134 add R32(%rbx), R32(%rbx) C restore CF from the save register
135 lea 32(vp), vp
136 ADCSBB 24(up), %r10
137 lea 32(up), up
138 mov (vp), %r8 C load that the skipped part of the loop body would have done
139 jmp L(lo1)
140 L(1): add R32(%rbx), R32(%rbx) C restore CF from the save register
141 ADCSBB 24(up), %r10
142 jmp L(wd1)
144 L(b2): xor %r9, %r9 C n mod 4 = 2; zero bits enter below the lowest limb
145 mov 16(vp), %r10
146 shrd R8(%rcx), %r10, %r9
147 mov 24(vp), %r11
148 shrd R8(%rcx), %r11, %r10
149 sub $2, %rax
150 jz L(2) C n = 2: no full groups follow
151 add R32(%rbx), R32(%rbx) C restore CF from the save register
152 lea 32(vp), vp
153 ADCSBB 16(up), %r9
154 ADCSBB 24(up), %r10
155 lea 32(up), up
156 jmp L(lo2)
157 L(2): add R32(%rbx), R32(%rbx) C restore CF from the save register
158 ADCSBB 16(up), %r9
159 ADCSBB 24(up), %r10
160 jmp L(wd2)
162 ALIGN(32) C 16-byte alignment is not enough!
C Main loop, four limbs per iteration. The first half combines the shifted
C limbs with up[] and stores them; the second half, from L(lo0), loads and
C shifts the next four vp limbs.
163 L(top): shrd R8(%rcx), %r11, %r10
164 add R32(%rbx), R32(%rbx) C restore CF from the save register
165 lea 32(vp), vp
166 ADCSBB (up), %rbp
167 ADCSBB 8(up), %r8
168 ADCSBB 16(up), %r9
169 ADCSBB 24(up), %r10
170 mov %rbp, (rp)
171 lea 32(up), up
172 L(lo3): mov %r8, 8(rp)
173 L(lo2): mov %r9, 16(rp)
174 mov (vp), %r8
175 L(lo1): mov %r10, 24(rp)
176 mov 8(vp), %r9
177 mov %r11, %rbp C previous top limb supplies the low bits of the next limb
178 lea 32(rp), rp
179 sbb R32(%rbx), R32(%rbx) C save CF as 0 or -1; the shifts and the sub below clobber flags
180 L(lo0): shrd R8(%rcx), %r8, %rbp
181 mov 16(vp), %r10
182 shrd R8(%rcx), %r9, %r8
183 shrd R8(%rcx), %r10, %r9
184 mov 24(vp), %r11
185 sub $4, %rax
186 jg L(top)
C Wind-down: combine and store the final group without loading further limbs.
188 shrd R8(%rcx), %r11, %r10
189 add R32(%rbx), R32(%rbx) C restore CF from the save register
190 ADCSBB (up), %rbp
191 ADCSBB 8(up), %r8
192 ADCSBB 16(up), %r9
193 ADCSBB 24(up), %r10
194 mov %rbp, (rp)
195 L(wd3): mov %r8, 8(rp)
196 L(wd2): mov %r9, 16(rp)
197 L(wd1): mov %r10, 24(rp)
198 adc R32(%rax), R32(%rax) C rax is zero after loop
199 shr R8(%rcx), %r11 C bits shifted out of the top vp limb
200 ADDSUB %r11, %rax C addlsh: carry + bits; rsblsh: borrow - bits
201 IFRSB( neg %rax)
202 pop %rbp
203 pop %rbx
204 FUNC_EXIT()
205 ret C return to the caller; without it execution runs on into func_nc
206 EPILOGUE()
C func_nc -- carry-in variant of func_n. It takes the same arguments plus cy,
C converts cy into the 0 or -1 value that func_n keeps in %rbx as its saved
C carry, and then continues at L(ent) inside func_n.
207 PROLOGUE(func_nc)
208 FUNC_ENTRY(4)
209 IFDOS(` mov 56(%rsp), %r8d ') C cnt
210 IFDOS(` mov 64(%rsp), %r9 ') C cy
211 push %rbx
212 neg cy C sets CF exactly when cy is nonzero
213 sbb R32(%rbx), R32(%rbx) C initialise CF save register
214 jmp L(ent) C func_n pushes %rbp there and does all remaining work
215 EPILOGUE()