beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / x86_64 / coreinhm / aorrlsh_n.asm
blobeed64e701e8b072e1112184d3fc8e9cb0f072865
1 dnl AMD64 mpn_addlsh_n -- rp[] = up[] + (vp[] << k)
2 dnl AMD64 mpn_rsblsh_n -- rp[] = (vp[] << k) - up[]
3 dnl Optimised for Nehalem.
5 dnl Contributed to the GNU project by Torbjorn Granlund.
7 dnl Copyright 2011, 2012 Free Software Foundation, Inc.
9 dnl This file is part of the GNU MP Library.
10 dnl
11 dnl The GNU MP Library is free software; you can redistribute it and/or modify
12 dnl it under the terms of either:
13 dnl
14 dnl * the GNU Lesser General Public License as published by the Free
15 dnl Software Foundation; either version 3 of the License, or (at your
16 dnl option) any later version.
17 dnl
18 dnl or
19 dnl
20 dnl * the GNU General Public License as published by the Free Software
21 dnl Foundation; either version 2 of the License, or (at your option) any
22 dnl later version.
23 dnl
24 dnl or both in parallel, as here.
25 dnl
26 dnl The GNU MP Library is distributed in the hope that it will be useful, but
27 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
28 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
29 dnl for more details.
30 dnl
31 dnl You should have received copies of the GNU General Public License and the
32 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
33 dnl see https://www.gnu.org/licenses/.
35 include(`../config.m4')
37 C cycles/limb
38 C AMD K8,K9 ?
39 C AMD K10 4.75
40 C Intel P4 ?
41 C Intel core2 2.8-3
42 C Intel NHM 2.8
43 C Intel SBR 3.55
44 C Intel atom ?
45 C VIA nano ?
47 C The inner-loop probably runs close to optimally on Nehalem (using 4-way
48 C unrolling). The rest of the code is quite crude, and could perhaps be made
49 C both smaller and faster.
51 C INPUT PARAMETERS
C Symbolic names for the argument registers used throughout this file.
C Array roles follow the formulas in the file header:
C   addlsh:  rp[] = up[] + (vp[] << k)
C   rsblsh:  rp[] = (vp[] << k) - up[]
C NOTE(review): on DOS64 builds the arguments are presumably moved into
C these registers by FUNC_ENTRY, which is defined outside this file.
52 define(`rp', `%rdi') C result array
53 define(`up', `%rsi') C operand that is added or subtracted unshifted
54 define(`vp', `%rdx') C operand that is shifted left before the add/subtract
55 define(`n', `%rcx') C element count; every array is indexed in 8-byte steps
56 define(`cnt', `%r8') C left shift count k (negated into cl for shrd/shr)
57 define(`cy', `%r9') C carry-in, read only by the _nc entry point
C Select the operation this file is assembled for.  Exactly one of the two
C OPERATION_* symbols is expected to be defined by the build; each branch
C defines the same five macros:
C   ADDSUB   plain add or sub, used once for the returned high word
C   ADCSBB   carry-propagating adc or sbb, used for every limb
C   IFRSB    expands to its argument only in the rsblsh build, otherwise to
C            nothing; used to negate the return value
C   func_n   name of the entry point without carry-in
C   func_nc  name of the entry point with carry-in
59 ifdef(`OPERATION_addlsh_n', `
60 define(ADDSUB, add)
61 define(ADCSBB, adc)
62 define(IFRSB, )
63 define(func_n, mpn_addlsh_n)
64 define(func_nc, mpn_addlsh_nc)')
C Reverse-subtract build: same skeleton with sub/sbb, and IFRSB passes its
C argument through.
65 ifdef(`OPERATION_rsblsh_n', `
66 define(ADDSUB, sub)
67 define(ADCSBB, sbb)
68 define(IFRSB, `$1')
69 define(func_n, mpn_rsblsh_n)
70 define(func_nc, mpn_rsblsh_nc)')
72 C mpn_rsblsh_nc removed below, its idea of carry-in is inconsistent with
73 C refmpn_rsblsh_nc
C Only three names are listed here.  The func_nc body further down is still
C emitted unconditionally, so an rsblsh build also contains a symbol named
C mpn_rsblsh_nc even though it is not listed.
C NOTE(review): MULFUNC_PROLOGUE, ABI_SUPPORT, ASM_START, TEXT and ALIGN are
C macros defined outside this file (presumably reached through config.m4);
C their exact effect cannot be confirmed from here.
74 MULFUNC_PROLOGUE(mpn_addlsh_n mpn_addlsh_nc mpn_rsblsh_n)
C Both ABIs are declared; the code handles the DOS64 case through
C FUNC_ENTRY, FUNC_EXIT and the IFDOS stack loads.
76 ABI_SUPPORT(DOS64)
77 ABI_SUPPORT(STD64)
79 ASM_START()
80 TEXT
81 ALIGN(32)
C ---------------------------------------------------------------------------
C func_n  (mpn_addlsh_n or mpn_rsblsh_n, chosen by the OPERATION_* symbol)
C
C   addlsh:  rp[] = up[] + (vp[] << cnt)
C   rsblsh:  rp[] = (vp[] << cnt) - up[]
C
C In:    rp, up, vp = arrays of n 8-byte limbs, cnt = left shift count.
C        The shift is done with a negated count in cl, which the hardware
C        masks to 6 bits, i.e. a right shift by 64-cnt.
C        NOTE(review): this assumes 1 <= cnt <= 63 and n >= 1 -- confirm
C        against the callers.
C Out:   rax = (bits shifted out of the top vp limb) + carry       (addlsh)
C        rax = (bits shifted out of the top vp limb) - borrow      (rsblsh)
C Saves: rbx and rbp are pushed here and popped before returning.
C
C Register roles after L(ent):
C   rax      negative limb index, counts up towards zero
C   rcx      minus cnt (only cl is used, as the shrd/shr count)
C   rbx      saved carry flag, 0 or -1; "add ebx,ebx" reloads CF from it and
C            "sbb ebx,ebx" stores CF back, because the index update
C            "add $k, rax" destroys CF
C   rbp      n mod 4 for the dispatch, then one limb of the shift window
C   r8-r11   sliding window of vp limbs being shifted
C   up, vp   rebased to point at the last limb, so 8(ptr,rax,8) is limb 0
C   rp       rebased by -40, so 40(rp,rax,8) is limb 0
C
C L(ent) is also the join point for func_nc, which arrives with rbx already
C pushed and preloaded with the incoming carry.
C ---------------------------------------------------------------------------
PROLOGUE(func_n)
	FUNC_ENTRY(4)
IFDOS(`	mov	56(%rsp), %r8d	')	C cnt
	push	%rbx
	xor	R32(%rbx), R32(%rbx)	C clear CF save register
L(ent):	push	%rbp
	mov	R32(n), R32(%rbp)	C keep n for the mod 4 dispatch
	mov	n, %rax			C copy n before rcx is reused below

	mov	R32(cnt), R32(%rcx)
	neg	R32(%rcx)		C cl = -cnt, acts as 64-cnt in shrd/shr

	lea	-8(up,%rax,8), up	C point at the last limb of up
	lea	-8(vp,%rax,8), vp	C point at the last limb of vp
	lea	-40(rp,%rax,8), rp	C biased so that store offsets line up
	neg	%rax			C rax = -n

	and	$3, R32(%rbp)		C dispatch on n mod 4
	jz	L(b0)
	cmp	$2, R32(%rbp)
	jc	L(b1)			C n mod 4 = 1
	jz	L(b2)			C n mod 4 = 2

C n mod 4 = 3: handle two limbs here, enter the loop tail at L(lo3)
L(b3):	xor	R32(%r9), R32(%r9)	C zero is shifted into limb 0
	mov	8(vp,%rax,8), %r10
	mov	16(vp,%rax,8), %r11
	shrd	%cl, %r10, %r9
	shrd	%cl, %r11, %r10
	add	R32(%rbx), R32(%rbx)	C reload CF
	ADCSBB	8(up,%rax,8), %r9
	mov	24(vp,%rax,8), %r8
	ADCSBB	16(up,%rax,8), %r10
	sbb	R32(%rbx), R32(%rbx)	C save CF
	add	$3, %rax
	jmp	L(lo3)

C n mod 4 = 0: handle three limbs here, enter the loop tail at L(lo0)
L(b0):	mov	8(vp,%rax,8), %r9
	xor	R32(%r8), R32(%r8)	C zero is shifted into limb 0
	shrd	%cl, %r9, %r8
	mov	16(vp,%rax,8), %r10
	mov	24(vp,%rax,8), %r11
	shrd	%cl, %r10, %r9
	shrd	%cl, %r11, %r10
	add	R32(%rbx), R32(%rbx)	C reload CF
	ADCSBB	8(up,%rax,8), %r8
	mov	%r8, 40(rp,%rax,8)		C offset 40
	ADCSBB	16(up,%rax,8), %r9
	mov	32(vp,%rax,8), %r8
	ADCSBB	24(up,%rax,8), %r10
	sbb	R32(%rbx), R32(%rbx)	C save CF
	add	$4, %rax
	jmp	L(lo0)

C n mod 4 = 1: a single limb goes straight to the wind-down code, otherwise
C enter the loop body at L(lo1) with a zero limb below limb 0
L(b1):	mov	8(vp,%rax,8), %r8
	add	$1, %rax
	jz	L(1)			C n = 1
	mov	8(vp,%rax,8), %r9
	xor	R32(%rbp), R32(%rbp)
	jmp	L(lo1)
L(1):	xor	R32(%r11), R32(%r11)	C zero is shifted into the only limb
	jmp	L(wd1)

C n mod 4 = 2: handle one limb here, then fall into the loop (or finish
C directly when n = 2)
L(b2):	xor	R32(%r10), R32(%r10)	C 32-bit form, same as the other zeroings
	mov	8(vp,%rax,8), %r11
	shrd	%cl, %r11, %r10
	add	R32(%rbx), R32(%rbx)	C reload CF
	mov	16(vp,%rax,8), %r8
	ADCSBB	8(up,%rax,8), %r10
	sbb	R32(%rbx), R32(%rbx)	C save CF
	add	$2, %rax
	jz	L(end)

C Main loop, four limbs per iteration.  The "offset" remarks give the rp
C displacement relative to the index value at L(top).
	ALIGN(16)
L(top):	mov	8(vp,%rax,8), %r9
	mov	%r11, %rbp
L(lo2):	mov	%r10, 24(rp,%rax,8)		C offset 24
L(lo1):	shrd	%cl, %r8, %rbp
	shrd	%cl, %r9, %r8
	mov	16(vp,%rax,8), %r10
	mov	24(vp,%rax,8), %r11
	shrd	%cl, %r10, %r9
	shrd	%cl, %r11, %r10
	add	R32(%rbx), R32(%rbx)	C reload CF
	ADCSBB	(up,%rax,8), %rbp
	ADCSBB	8(up,%rax,8), %r8
	mov	%r8, 40(rp,%rax,8)		C offset 40
	ADCSBB	16(up,%rax,8), %r9
	mov	32(vp,%rax,8), %r8
	ADCSBB	24(up,%rax,8), %r10
	sbb	R32(%rbx), R32(%rbx)	C save CF
	add	$4, %rax		C sets ZF for the jnz below
	mov	%rbp, (rp,%rax,8)		C offset 32
L(lo0):
L(lo3):	mov	%r9, 16(rp,%rax,8)		C offset 48
	jnz	L(top)			C mov leaves the flags of add intact

C Wind-down: store the pending limb, produce the top limb, build the return
C value from the final carry and the bits shifted out of the top vp limb.
L(end):	mov	%r10, 24(rp,%rax,8)
L(wd1):	shrd	%cl, %r8, %r11
	add	R32(%rbx), R32(%rbx)	C reload CF
	ADCSBB	(up,%rax,8), %r11
	mov	%r11, 32(rp,%rax,8)		C offset 32
	adc	R32(%rax), R32(%rax)	C rax is zero after loop
	shr	R8(%rcx), %r8		C bits shifted out of the top limb
	ADDSUB	%r8, %rax
IFRSB(	neg	%rax)
	pop	%rbp
	pop	%rbx
	FUNC_EXIT()
	ret				C without this, control falls into func_nc
EPILOGUE()
C func_nc: carry-in variant of func_n.  It takes the same arguments plus cy,
C turns cy into the saved-carry form kept in rbx, and then continues in the
C func_n body at L(ent).  There is no separate return path; the function
C returns through the epilogue code of func_n.
192 PROLOGUE(func_nc)
193 FUNC_ENTRY(4)
C The two stack loads must come before the push below, since their offsets
C are relative to the current rsp.
194 IFDOS(` mov 56(%rsp), %r8d ') C cnt, loaded from the stack on DOS64 only
195 IFDOS(` mov 64(%rsp), %r9 ') C cy, loaded from the stack on DOS64 only
196 push %rbx
C neg sets CF exactly when cy is nonzero (and overwrites cy, which is not
C read again); sbb then turns CF into 0 or -1 in ebx.
197 neg cy
198 sbb R32(%rbx), R32(%rbx) C initialise CF save register
C L(ent) lies after the "push rbx" and "xor ebx,ebx" of func_n, so the
C push above stands in for that one and the carry just set up is kept.
199 jmp L(ent)
200 EPILOGUE()