beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / x86 / k6 / k62mmx / lshift.asm
blobc86575feed9b7117fd170ae3550d58e1770c6353
1 dnl AMD K6-2 mpn_lshift -- mpn left shift.
3 dnl Copyright 1999, 2000, 2002 Free Software Foundation, Inc.
5 dnl This file is part of the GNU MP Library.
6 dnl
7 dnl The GNU MP Library is free software; you can redistribute it and/or modify
8 dnl it under the terms of either:
9 dnl
10 dnl * the GNU Lesser General Public License as published by the Free
11 dnl Software Foundation; either version 3 of the License, or (at your
12 dnl option) any later version.
13 dnl
14 dnl or
15 dnl
16 dnl * the GNU General Public License as published by the Free Software
17 dnl Foundation; either version 2 of the License, or (at your option) any
18 dnl later version.
19 dnl
20 dnl or both in parallel, as here.
21 dnl
22 dnl The GNU MP Library is distributed in the hope that it will be useful, but
23 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
25 dnl for more details.
26 dnl
27 dnl You should have received copies of the GNU General Public License and the
28 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
29 dnl see https://www.gnu.org/licenses/.
include(`../config.m4')


C Measured speed recorded for this routine:
C
C	K6-2: 1.75 cycles/limb


C Prototype of the routine defined in this file:
C
C	mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
C	                      unsigned shift);


C Stack parameters, as offsets from the stack pointer at function entry.
C FRAME is the number of bytes pushed since entry and is applied to these
C offsets wherever a parameter name is used.

defframe(PARAM_DST,   4)
defframe(PARAM_SRC,   8)
defframe(PARAM_SIZE, 12)
defframe(PARAM_SHIFT,16)
deflit(`FRAME',0)

dnl  The src parameter slot is reused to hold the return value; this is
dnl  only valid once src has been fetched into a register.
define(VAR_RETVAL,`PARAM_SRC')

dnl  Smallest size sent to the unrolled loop.  Must be at least 9, because
dnl  the unrolled loop can't handle less.
deflit(UNROLL_THRESHOLD, 9)

	TEXT
	ALIGN(32)
C -----------------------------------------------------------------------------
C mpn_lshift -- shift a limb vector left by a bit count.
C
C Reads size 32-bit limbs from src, shifts the whole vector left by shift
C bits and stores size limbs at dst.  The return value, in eax, is the
C group of bits shifted out of the most significant limb, i.e.
C src[size-1] >> (32-shift).
C
C Processing runs from the high limb down to the low limb.
C
C Three code paths:
C   size == 1                    integer shldl/shll only, no MMX
C   2 <= size < UNROLL_THRESHOLD one limb per iteration, L(simple)
C   size >= UNROLL_THRESHOLD     four limbs per iteration, L(top)
C
C Register use: ebx is saved and restored on every path; eax, ecx, edx
C and mm0, mm1, mm2, mm6, mm7 are overwritten.  Every path that touches
C MMX registers executes femms before returning.
C
C NOTE(review): the code builds 32-shift and 64-shift as shift counts, so
C shift is presumably expected to be in the range 1 to 31, and size to be
C at least 1 -- confirm against the mpn_lshift documentation.

PROLOGUE(mpn_lshift)
deflit(`FRAME',0)

	C The 1 limb case can be done without the push %ebx, but it's then
	C still the same speed.  The push is left as a free helping hand for
	C the two_or_more code.

	movl	PARAM_SIZE, %eax
	pushl	%ebx			FRAME_pushl()

	movl	PARAM_SRC, %ebx
	decl	%eax			C size-1, sets ZF when size is 1

	movl	PARAM_SHIFT, %ecx	C movl leaves the flags from decl intact
	jnz	L(two_or_more)

	C Single limb.  eax is zero here, so shldl leaves in it just the
	C bits shifted out of the top of edx.

	movl	(%ebx), %edx		C src limb
	movl	PARAM_DST, %ebx

	shldl(	%cl, %edx, %eax)	C return value

	shll	%cl, %edx

	movl	%edx, (%ebx)		C dst limb
	popl	%ebx

	ret


C -----------------------------------------------------------------------------
	ALIGN(16)	C avoid offset 0x1f
L(two_or_more):
	C eax	size-1
	C ebx	src
	C ecx	shift
	C edx

	movl	(%ebx,%eax,4), %edx	C src high limb
	negl	%ecx

	movd	PARAM_SHIFT, %mm6
	addl	$32, %ecx		C 32-shift

	shrl	%cl, %edx		C bits shifted out of the high limb
	cmpl	$UNROLL_THRESHOLD-1, %eax

	movl	%edx, VAR_RETVAL	C movl leaves the flags from cmpl intact
	jae	L(unroll)


	movd	%ecx, %mm7
	movl	%eax, %ecx

	movl	PARAM_DST, %eax

L(simple):
	C eax	dst
	C ebx	src
	C ecx	counter, size-1 to 1
	C edx	retval
	C
	C mm0	scratch
	C mm6	shift
	C mm7	32-shift
	C
	C Each pass loads the limb pair src[ecx-1],src[ecx] as one qword and
	C shifts it right by 32-shift, leaving dst[ecx] in the low dword.

	movq	-4(%ebx,%ecx,4), %mm0

	psrlq	%mm7, %mm0

Zdisp(	movd,	%mm0, 0,(%eax,%ecx,4))
	loop	L(simple)


	C Lowest limb: only src[0] contributes.

	movd	(%ebx), %mm0
	popl	%ebx

	psllq	%mm6, %mm0

	movd	%mm0, (%eax)
	movl	%edx, %eax

	femms
	ret			C required: without it execution runs on into
				C L(unroll) with ebx already popped


C -----------------------------------------------------------------------------
	ALIGN(16)
L(unroll):
	C eax	size-1
	C ebx	src
	C ecx	32-shift
	C edx	retval (but instead VAR_RETVAL is used)
	C
	C mm6	shift

	addl	$32, %ecx		C 64-shift
	movl	PARAM_DST, %edx

	movd	%ecx, %mm7
	subl	$7, %eax			C size-8

	leal	(%edx,%eax,4), %ecx		C alignment of dst

	movq	32-8(%ebx,%eax,4), %mm2		C src high qword
	testb	$4, %cl

	jz	L(dst_aligned)

	C The high dst qword is not 8-byte aligned.  Produce the top dst
	C limb on its own and continue with a size one smaller, so that the
	C qword stores in the loop are aligned.

	psllq	%mm6, %mm2

	psrlq	$32, %mm2
	decl	%eax

	movd	%mm2, 32(%edx,%eax,4)		C dst high limb
	movq	32-8(%ebx,%eax,4), %mm2		C new src high qword
L(dst_aligned):

	movq	32-16(%ebx,%eax,4), %mm0	C src second highest qword


	C This loop is the important bit, the rest is just support for it.
	C Four src limbs are held at the start, and four more will be read.
	C Four dst limbs will be written.  This schedule seems necessary for
	C full speed.
	C
	C The use of size-8 lets the loop stop when %eax goes negative and
	C leaves -4 to -1 which can be tested with test $1 and $2.

L(top):
	C eax	counter, size-8 step by -4 until <0
	C ebx	src
	C ecx
	C edx	dst
	C
	C mm0	src next qword
	C mm1	scratch
	C mm2	src prev qword
	C mm6	shift
	C mm7	64-shift
	C
	C The jnc at the bottom tests the borrow from the subl; none of the
	C MMX instructions or moves in between change the flags.

	psllq	%mm6, %mm2
	subl	$4, %eax

	movq	%mm0, %mm1
	psrlq	%mm7, %mm0

	por	%mm0, %mm2
	movq	24(%ebx,%eax,4), %mm0

	psllq	%mm6, %mm1
	movq	%mm2, 40(%edx,%eax,4)

	movq	%mm0, %mm2
	psrlq	%mm7, %mm0

	por	%mm0, %mm1
	movq	16(%ebx,%eax,4), %mm0

	movq	%mm1, 32(%edx,%eax,4)
	jnc	L(top)


	C Now have four limbs in mm2 (prev) and mm0 (next), plus eax mod 4.
	C
	C 8(%ebx,%eax,4) is the next source qword, and 24(%edx,%eax,4) is
	C the next destination qword.  %eax is between -4 and -1,
	C representing respectively 0 to 3 extra limbs that must be read.


	testl	$2, %eax	C testl to avoid bad cache line crossing
	jz	L(finish_nottwo)

	C Two more limbs: lshift mm2, OR it with rshifted mm0, mm0 becomes
	C new mm2 and a new mm0 is loaded.

	psllq	%mm6, %mm2
	movq	%mm0, %mm1

	psrlq	%mm7, %mm0
	subl	$2, %eax

	por	%mm0, %mm2
	movq	16(%ebx,%eax,4), %mm0

	movq	%mm2, 32(%edx,%eax,4)
	movq	%mm1, %mm2
L(finish_nottwo):
	C eax	-4 if no limb remains beyond mm0, -3 if src[0] remains


	C lshift mm2, OR with rshifted mm0, mm1 becomes lshifted mm0
	C
	C The flags from testb are consumed by the jz below; the MMX
	C instructions in between leave them alone.

	testb	$1, %al
	psllq	%mm6, %mm2

	movq	%mm0, %mm1
	psrlq	%mm7, %mm0

	por	%mm0, %mm2
	psllq	%mm6, %mm1

	movq	%mm2, 24(%edx,%eax,4)
	jz	L(finish_even)


	C Size is odd, so mm1 and one extra limb to process.

	movd	(%ebx), %mm0		C src[0]
	popl	%ebx
deflit(`FRAME',0)

	movq	%mm0, %mm2
	psllq	$32, %mm0

	psrlq	%mm7, %mm0

	psllq	%mm6, %mm2
	por	%mm0, %mm1

	movq	%mm1, 4(%edx)		C dst[1,2]
	movd	%mm2, (%edx)		C dst[0]

	movl	VAR_RETVAL, %eax

	femms
	ret			C required: without it execution runs on into
				C L(finish_even) and pops ebx a second time


	nop	C avoid bad cache line crossing
L(finish_even):
deflit(`FRAME',4)
	C Size is even, so only mm1 left to process.

	movq	%mm1, (%edx)		C dst[0,1]
	movl	VAR_RETVAL, %eax

	popl	%ebx
	femms
	ret			C required: without it execution runs off the
				C end of the function

EPILOGUE()