beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / x86 / pentium / rshift.asm
blob2105c4c93528f5f03bf32437ac9a56b9dbd1d469
1 dnl Intel Pentium mpn_rshift -- mpn right shift.
3 dnl Copyright 1992, 1994-1996, 1999, 2000, 2002 Free Software Foundation, Inc.
5 dnl This file is part of the GNU MP Library.
6 dnl
7 dnl The GNU MP Library is free software; you can redistribute it and/or modify
8 dnl it under the terms of either:
9 dnl
10 dnl * the GNU Lesser General Public License as published by the Free
11 dnl Software Foundation; either version 3 of the License, or (at your
12 dnl option) any later version.
13 dnl
14 dnl or
15 dnl
16 dnl * the GNU General Public License as published by the Free Software
17 dnl Foundation; either version 2 of the License, or (at your option) any
18 dnl later version.
19 dnl
20 dnl or both in parallel, as here.
21 dnl
22 dnl The GNU MP Library is distributed in the hope that it will be useful, but
23 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
25 dnl for more details.
26 dnl
27 dnl You should have received copies of the GNU General Public License and the
28 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
29 dnl see https://www.gnu.org/licenses/.
31 include(`../config.m4')
34 C cycles/limb
35 C P5,P54: 6.0
36 C P55: 5.375
39 C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
40 C unsigned shift);
42 C The main shift-by-N loop should run at 5.375 c/l and that's what P55 does,
43 C but P5 and P54 run only at 6.0 c/l, which is 4 cycles lost somewhere.
C Stack offsets of the four arguments, relative to the return address.
defframe(PARAM_SHIFT,16)
defframe(PARAM_SIZE, 12)
defframe(PARAM_SRC,  8)
defframe(PARAM_DST,  4)

	TEXT
	ALIGN(8)

C mpn_rshift -- shift the limb vector at PARAM_SRC right by PARAM_SHIFT bits
C and store the PARAM_SIZE result limbs at PARAM_DST.
C
C Returns in %eax the bits shifted out of the least significant limb,
C positioned in the high end of the register.
C
C Register roles (after the loads below):
C	%edi	destination pointer
C	%esi	source pointer
C	%ebp	size, later reused as loop counter
C	%ecx	shift count (%cl is used by the shifts)
C	%eax, %ebx, %edx	limbs in flight
C
C %edi, %esi, %ebx and %ebp are saved on entry and restored before return.
C NOTE(review): no check is made for size == 0; presumably callers guarantee
C size >= 1 -- confirm against the mpn interface documentation.

PROLOGUE(mpn_rshift)

	pushl	%edi
	pushl	%esi
	pushl	%ebx
	pushl	%ebp
deflit(`FRAME',16)

	movl	PARAM_DST,%edi
	movl	PARAM_SRC,%esi
	movl	PARAM_SIZE,%ebp
	movl	PARAM_SHIFT,%ecx

C We can use faster code for shift-by-1 under certain conditions.
	cmp	$1,%ecx
	jne	L(normal)
	leal	4(%edi),%eax
	cmpl	%esi,%eax
	jnc	L(special)		C jump if res_ptr + 1 >= s_ptr
	leal	(%edi,%ebp,4),%eax
	cmpl	%eax,%esi
	jnc	L(special)		C jump if s_ptr >= res_ptr + size

C General shift-by-N path.  Walks upward from the least significant limb,
C combining each pair of adjacent source limbs with shrdl.
L(normal):
	movl	(%esi),%edx		C %edx = least significant source limb
	addl	$4,%esi
	xorl	%eax,%eax
	shrdl(	%cl, %edx, %eax)	C compute carry limb
	pushl	%eax			C push carry limb onto stack

	decl	%ebp			C one limb is already held in %edx
	pushl	%ebp			C save size-1 for the remainder loop
	shrl	$3,%ebp			C number of 8-limb iterations
	jz	L(end)

	movl	(%edi),%eax		C fetch destination cache line

	ALIGN(4)
C Main loop, unrolled to 8 limbs per iteration.  On entry to each iteration
C %edx holds the source limb whose result has not been stored yet.
L(oop):	movl	28(%edi),%eax		C fetch destination cache line
	movl	%edx,%ebx

	movl	(%esi),%eax
	movl	4(%esi),%edx
	shrdl(	%cl, %eax, %ebx)
	shrdl(	%cl, %edx, %eax)
	movl	%ebx,(%edi)
	movl	%eax,4(%edi)

	movl	8(%esi),%ebx
	movl	12(%esi),%eax
	shrdl(	%cl, %ebx, %edx)
	shrdl(	%cl, %eax, %ebx)
	movl	%edx,8(%edi)
	movl	%ebx,12(%edi)

	movl	16(%esi),%edx
	movl	20(%esi),%ebx
	shrdl(	%cl, %edx, %eax)
	shrdl(	%cl, %ebx, %edx)
	movl	%eax,16(%edi)
	movl	%edx,20(%edi)

	movl	24(%esi),%eax
	movl	28(%esi),%edx
	shrdl(	%cl, %eax, %ebx)
	shrdl(	%cl, %edx, %eax)
	movl	%ebx,24(%edi)
	movl	%eax,28(%edi)

	addl	$32,%esi
	addl	$32,%edi
	decl	%ebp
	jnz	L(oop)

C Remainder: handle the (size-1) mod 8 limbs left over, one at a time.
L(end):	popl	%ebp			C recover size-1
	andl	$7,%ebp
	jz	L(end2)
L(oop2):
	movl	(%esi),%eax
	shrdl(	%cl,%eax,%edx)		C compute result limb
	movl	%edx,(%edi)
	movl	%eax,%edx
	addl	$4,%esi
	addl	$4,%edi
	decl	%ebp
	jnz	L(oop2)

L(end2):
	shrl	%cl,%edx		C compute most significant limb
	movl	%edx,(%edi)		C store it

	popl	%eax			C pop carry limb

	popl	%ebp
	popl	%ebx
	popl	%esi
	popl	%edi
	ret				C without this the code would fall
					C through into L(special)


C Shift-by-1 path.  We loop from the most significant end of the arrays,
C passing the shifted-out bit from limb to limb in the carry flag via rcrl.
C This direction is only permissible for the operand layouts accepted by
C the two jumps above (res_ptr + 1 >= s_ptr, or s_ptr >= res_ptr + size),
C since the function is documented to work for overlapping source and
C destination.

L(special):
	leal	-4(%edi,%ebp,4),%edi	C point at the last destination limb
	leal	-4(%esi,%ebp,4),%esi	C point at the last source limb

	movl	(%esi),%edx		C %edx = most significant source limb
	subl	$4,%esi

	decl	%ebp			C one limb is already held in %edx
	pushl	%ebp			C save size-1 for the remainder loop
	shrl	$3,%ebp			C number of 8-limb iterations

	shrl	%edx			C shift top limb, low bit goes to carry
	incl	%ebp			C inc/dec pair sets the zero flag from
	decl	%ebp			C %ebp while leaving carry untouched
	jz	L(Lend)

	movl	(%edi),%eax		C fetch destination cache line

	ALIGN(4)
C Main loop, unrolled to 8 limbs per iteration.  The carry flag must stay
C live across the whole loop, so only mov, lea, rcr and dec are used.
L(Loop):
	movl	-28(%edi),%eax		C fetch destination cache line
	movl	%edx,%ebx

	movl	(%esi),%eax
	movl	-4(%esi),%edx
	rcrl	%eax
	movl	%ebx,(%edi)
	rcrl	%edx
	movl	%eax,-4(%edi)

	movl	-8(%esi),%ebx
	movl	-12(%esi),%eax
	rcrl	%ebx
	movl	%edx,-8(%edi)
	rcrl	%eax
	movl	%ebx,-12(%edi)

	movl	-16(%esi),%edx
	movl	-20(%esi),%ebx
	rcrl	%edx
	movl	%eax,-16(%edi)
	rcrl	%ebx
	movl	%edx,-20(%edi)

	movl	-24(%esi),%eax
	movl	-28(%esi),%edx
	rcrl	%eax
	movl	%ebx,-24(%edi)
	rcrl	%edx
	movl	%eax,-28(%edi)

	leal	-32(%esi),%esi		C use leal not to clobber carry
	leal	-32(%edi),%edi
	decl	%ebp			C decl leaves carry untouched
	jnz	L(Loop)

C Remainder: handle the (size-1) mod 8 limbs left over, one at a time.
C andl clobbers the carry flag, so it is parked in %eax around it.
L(Lend):
	popl	%ebp			C recover size-1
	sbbl	%eax,%eax		C save carry in %eax
	andl	$7,%ebp
	jz	L(Lend2)
	addl	%eax,%eax		C restore carry from eax
L(Loop2):
	movl	%edx,%ebx
	movl	(%esi),%edx
	rcrl	%edx
	movl	%ebx,(%edi)

	leal	-4(%esi),%esi		C use leal not to clobber carry
	leal	-4(%edi),%edi
	decl	%ebp
	jnz	L(Loop2)

	jmp	L(L1)
L(Lend2):
	addl	%eax,%eax		C restore carry from eax
L(L1):	movl	%edx,(%edi)		C store last limb

	movl	$0,%eax			C movl, not xorl, so carry survives
	rcrl	%eax			C return the shifted-out bit in bit 31

	popl	%ebp
	popl	%ebx
	popl	%esi
	popl	%edi
	ret

EPILOGUE()