dnl  beta-0.89.2
dnl  [luatex.git] / source / libs / gmp / gmp-src / mpn / x86 / pentium / lshift.asm
dnl  blob 2a31f36c6edd21f65516bebe4569f0df57cae0e7
dnl  Intel Pentium mpn_lshift -- mpn left shift.

dnl  Copyright 1992, 1994-1996, 1999, 2000, 2002 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.
include(`../config.m4')


C         cycles/limb
C P5,P54:    6.0
C P55:       5.375


C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
C                       unsigned shift);
C
C Shift the size-limb operand at src left by shift bits and store the
C size-limb result at dst.  The bits shifted out of the most significant
C limb are returned in %eax.
C
C Assumptions, none of which are checked here:
C   - size >= 1: the first source limb is loaded unconditionally.
C   - 1 <= shift <= 31, as in the GMP manual's description of mpn_lshift.
C
C Syntax is AT&T (source operand first), run through m4.  Arguments are
C read from the stack through the PARAM_* frame offsets defined below.
C
C Two code paths:
C   L(normal)   Any shift count.  Walks from the most significant limb
C               down to the least significant, using shldl.
C   L(special)  shift == 1 only.  Walks from the least significant limb
C               up, doubling each limb with an add/adc carry chain.  Taken
C               only when the pointer comparisons at function entry show
C               that this direction cannot overwrite unread source limbs.
C
C Register use:
C   %edi  destination pointer        %esi  source pointer
C   %ebp  loop counter               %ecx  shift count (%cl)
C   %eax, %ebx, %edx  limbs in flight; %eax also carries the return value
C
C %edi, %esi, %ebx and %ebp are saved on entry and restored before each ret.
C
C The main shift-by-N loop should run at 5.375 c/l and that's what P55 does,
C but P5 and P54 run only at 6.0 c/l, which is 4 cycles lost somewhere.

defframe(PARAM_SHIFT,16)
defframe(PARAM_SIZE, 12)
defframe(PARAM_SRC,  8)
defframe(PARAM_DST,  4)

	TEXT
	ALIGN(8)
PROLOGUE(mpn_lshift)

	pushl	%edi
	pushl	%esi
	pushl	%ebx
	pushl	%ebp
deflit(`FRAME',16)		C four registers pushed above

	C All PARAM_* loads happen here, before any further pushes, so FRAME
	C does not need adjusting later on.
	movl	PARAM_DST,%edi
	movl	PARAM_SRC,%esi
	movl	PARAM_SIZE,%ebp
	movl	PARAM_SHIFT,%ecx

C We can use faster code for shift-by-1 under certain conditions.
	cmp	$1,%ecx
	jne	L(normal)
	leal	4(%esi),%eax
	cmpl	%edi,%eax
	jnc	L(special)		C jump if s_ptr + 1 >= res_ptr
	leal	(%esi,%ebp,4),%eax
	cmpl	%eax,%edi
	jnc	L(special)		C jump if res_ptr >= s_ptr + size

L(normal):
	C Point both pointers at the most significant limb.
	leal	-4(%edi,%ebp,4),%edi
	leal	-4(%esi,%ebp,4),%esi

	movl	(%esi),%edx		C most significant source limb
	subl	$4,%esi
	xorl	%eax,%eax
	shldl(	%cl, %edx, %eax)	C compute carry limb
	pushl	%eax			C push carry limb onto stack

	decl	%ebp			C size-1 limbs take bits from a lower neighbour
	pushl	%ebp			C keep size-1 for the remainder loop
	shrl	$3,%ebp			C number of full 8-limb groups
	jz	L(end)

	C The value loaded is discarded (%eax is overwritten at the top of the
	C loop); the load is only there to touch the destination.
	movl	(%edi),%eax		C fetch destination cache line

	ALIGN(4)
	C Unrolled loop: 8 limbs per iteration.  On entry %edx holds the limb
	C just above (%esi).  Each shldl shifts its destination register left
	C by %cl and fills the vacated low bits from the top of the next lower
	C limb.  %edx again holds the pending limb on exit.
L(oop):	movl	-28(%edi),%eax		C fetch destination cache line
	movl	%edx,%ebx

	movl	(%esi),%eax
	movl	-4(%esi),%edx
	shldl(	%cl, %eax, %ebx)
	shldl(	%cl, %edx, %eax)
	movl	%ebx,(%edi)
	movl	%eax,-4(%edi)

	movl	-8(%esi),%ebx
	movl	-12(%esi),%eax
	shldl(	%cl, %ebx, %edx)
	shldl(	%cl, %eax, %ebx)
	movl	%edx,-8(%edi)
	movl	%ebx,-12(%edi)

	movl	-16(%esi),%edx
	movl	-20(%esi),%ebx
	shldl(	%cl, %edx, %eax)
	shldl(	%cl, %ebx, %edx)
	movl	%eax,-16(%edi)
	movl	%edx,-20(%edi)

	movl	-24(%esi),%eax
	movl	-28(%esi),%edx
	shldl(	%cl, %eax, %ebx)
	shldl(	%cl, %edx, %eax)
	movl	%ebx,-24(%edi)
	movl	%eax,-28(%edi)

	subl	$32,%esi
	subl	$32,%edi
	decl	%ebp
	jnz	L(oop)

L(end):	popl	%ebp			C size-1 again
	andl	$7,%ebp			C limbs left over after the 8-limb groups
	jz	L(end2)
	C Remainder loop: one limb per iteration, same scheme as above.
L(oop2):
	movl	(%esi),%eax
	shldl(	%cl,%eax,%edx)
	movl	%edx,(%edi)
	movl	%eax,%edx
	subl	$4,%esi
	subl	$4,%edi
	decl	%ebp
	jnz	L(oop2)

L(end2):
	shll	%cl,%edx		C compute least significant limb
	movl	%edx,(%edi)		C store it

	popl	%eax			C pop carry limb

	popl	%ebp
	popl	%ebx
	popl	%esi
	popl	%edi
	ret				C return value: carry limb in %eax


C Shift-by-1 path.  We loop from the least significant end of the arrays.
C That direction is only permissible when it cannot overwrite source limbs
C that have not been read yet, because the function is documented to work
C for overlapping source and destination.  The pointer comparisons at
C function entry only branch here when res_ptr <= s_ptr + 1 or
C res_ptr >= s_ptr + size.
C
C Each limb is doubled with adcl, so the bit shifted out of one limb
C travels to the next through the carry flag.  The loop bookkeeping below
C therefore uses only instructions that leave the carry flag alone
C (movl, leal, incl, decl).

L(special):
	movl	(%esi),%edx		C least significant source limb
	addl	$4,%esi

	decl	%ebp			C size-1 limbs remain to be loaded
	pushl	%ebp			C keep size-1 for the remainder loop
	shrl	$3,%ebp			C number of full 8-limb groups

	addl	%edx,%edx		C limb 0 doubled, top bit goes to carry
	incl	%ebp			C incl/decl pair sets the zero flag from
	decl	%ebp			C   %ebp without disturbing the carry
	jz	L(Lend)

	C As in the normal path, the loaded value is discarded.
	movl	(%edi),%eax		C fetch destination cache line

	ALIGN(4)
	C Unrolled loop: 8 limbs per iteration.  On entry %edx holds a result
	C limb that has not been stored yet, and the carry flag holds the bit
	C shifted out of it.  The same is true on exit.
L(Loop):
	movl	28(%edi),%eax		C fetch destination cache line
	movl	%edx,%ebx

	movl	(%esi),%eax
	movl	4(%esi),%edx
	adcl	%eax,%eax
	movl	%ebx,(%edi)
	adcl	%edx,%edx
	movl	%eax,4(%edi)

	movl	8(%esi),%ebx
	movl	12(%esi),%eax
	adcl	%ebx,%ebx
	movl	%edx,8(%edi)
	adcl	%eax,%eax
	movl	%ebx,12(%edi)

	movl	16(%esi),%edx
	movl	20(%esi),%ebx
	adcl	%edx,%edx
	movl	%eax,16(%edi)
	adcl	%ebx,%ebx
	movl	%edx,20(%edi)

	movl	24(%esi),%eax
	movl	28(%esi),%edx
	adcl	%eax,%eax
	movl	%ebx,24(%edi)
	adcl	%edx,%edx
	movl	%eax,28(%edi)

	leal	32(%esi),%esi		C use leal not to clobber carry
	leal	32(%edi),%edi
	decl	%ebp
	jnz	L(Loop)

L(Lend):
	popl	%ebp			C size-1 again
	sbbl	%eax,%eax		C save carry in %eax
	andl	$7,%ebp			C leftover limb count (this clears the carry)
	jz	L(Lend2)
	addl	%eax,%eax		C restore carry from eax
	C Remainder loop: one limb per iteration, carry chain kept intact.
L(Loop2):
	movl	%edx,%ebx
	movl	(%esi),%edx
	adcl	%edx,%edx
	movl	%ebx,(%edi)

	leal	4(%esi),%esi		C use leal not to clobber carry
	leal	4(%edi),%edi
	decl	%ebp
	jnz	L(Loop2)

	jmp	L(L1)
L(Lend2):
	addl	%eax,%eax		C restore carry from eax
L(L1):	movl	%edx,(%edi)		C store last limb

	sbbl	%eax,%eax		C %eax = -1 if carry set, else 0
	negl	%eax			C %eax = the bit shifted out of the top limb

	popl	%ebp
	popl	%ebx
	popl	%esi
	popl	%edi
	ret				C return value: carry bit in %eax

EPILOGUE()