dnl  Intel P5 mpn_lshift -- mpn left shift.
dnl  Copyright 2000-2002 Free Software Foundation, Inc.
dnl  This file is part of the GNU MP Library.
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.
31 include(`..
/config.m4
')
34 C P5: 1.75 cycles/limb.
C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
C                       unsigned shift);
C
C Shift src,size left by shift many bits and store the result in dst,size.
C Zeros are shifted in at the right.  Return the bits shifted out at the
C left.
C
C The comments in mpn_rshift apply here too.
C Symbolic names for the mpn_lshift arguments, used as instruction operands
C in the code below (e.g. "movl PARAM_SHIFT, %ecx").  dst, src and size get
C 4, 8 and 12 in prototype order, and shift gets 16.
C NOTE(review): defframe is defined outside this file; presumably the second
C argument is a byte offset from the stack pointer at function entry --
C confirm against the macro definition.
46 defframe(PARAM_SHIFT,16)
47 defframe(PARAM_SIZE, 12)
48 defframe(PARAM_SRC, 8)
49 defframe(PARAM_DST, 4)
dnl  minimum 5, because the unrolled loop can't handle less
53 deflit(UNROLL_THRESHOLD, 5)
68 movl PARAM_SHIFT
, %ecx
70 cmp $UNROLL_THRESHOLD
, %eax
73 movl
-4(%ebx,%eax,4), %edi C src
high limb
78 shldl
( %cl, %edi, %eax) C
eax was decremented to zero
82 movl
%edi, (%edx) C dst
low limb
83 popl
%edi C risk of data cache bank clash
90 C
-----------------------------------------------------------------------------
101 movd (%ebx,%eax,4), %mm5 C src high limb
103 movd %ecx, %mm6 C lshift
110 psrlq $32, %mm5 C retval
114 C eax counter, limbs, negative
126 movq -4(%ebx,%eax,4), %mm0
133 movd %mm0, 4(%edx,%eax,4)
152 C -----------------------------------------------------------------------------
164 movd
-4(%ebx,%eax,4), %mm5 C src
high limb
165 leal
(%ebx,%eax,4), %edi
167 movd
%ecx, %mm6 C lshift
171 jz L
(start_src_aligned
)
C src isn't aligned, process high limb separately (marked xxx) to
C make it so.
177 C source -8(ebx,%eax,4)
179 C +-------+-------+-------+--
181 C +-------+-------+-------+--
187 C +-------+-------+--
189 C +-------+-------+--
191 movq -8(%ebx,%eax,4), %mm0 C unaligned load
200 movd %mm0, (%edx,%eax,4)
201 L(start_src_aligned):
203 movq -8(%ebx,%eax,4), %mm1 C src high qword
204 leal (%edx,%eax,4), %edi
207 psrlq $32, %mm5 C return value
209 movq -16(%ebx,%eax,4), %mm3 C src second highest qword
210 jz L(start_dst_aligned)
C dst isn't aligned, subtract 4 to make it so, and pretend the shift
C is 32 bits extra.  High limb of dst (marked xxx) handled here
C separately.
216 C source
-8(ebx,%eax,4)
218 C
+-------+-------+--
220 C
+-------+-------+--
226 C
+-------+-------+-------+--
228 C
+-------+-------+-------+--
232 addl
$32, %ecx C new shift
239 C wasted cycle here waiting for
%mm0
241 movd
%mm0
, -4(%edx,%eax,4)
243 L
(start_dst_aligned
):
249 addl
$64, %ecx C
64-shift
253 subl
$8, %eax C
size-8
257 por
%mm1
, %mm3 C mm3 ready to store
C The comments in mpn_rshift apply here too.
C mm2 src qword from 16(%ebx,%eax,4)
C mm3 dst qword ready to store to 24(%edx,%eax,4)
281 movq
8(%ebx,%eax,4), %mm0
287 movq
%mm3
, 24(%edx,%eax,4) C prev
290 movq
(%ebx,%eax,4), %mm3 C
293 movq
%mm0
, 16(%edx,%eax,4)
C eax -4 to -1 representing respectively 0 to 3 limbs remaining
311 movq
8(%ebx,%eax,4), %mm0
317 movq
%mm3
, 24(%edx,%eax,4) C prev
C eax -4 or -3 representing respectively 0 or 1 limbs remaining
C
C mm2 src prev qword, from 16(%ebx,%eax,4)
C mm3 dst qword, for 24(%edx,%eax,4)
333 movd
%mm5
, %eax C retval
339 C One extra src limb
, destination was aligned.
342 C
--+---------------+-------+
344 C
--+---------------+-------+
346 C dest
edx+12 edx+4 edx
347 C
--+---------------+---------------+-------+
349 C
--+---------------+---------------+-------+
352 C mm7
= ecx = 64-shift
355 C One extra src limb
, destination was unaligned.
358 C
--+---------------+-------+
360 C
--+---------------+-------+
363 C
--+---------------+---------------+
365 C
--+---------------+---------------+
368 C mm7
= ecx = 64-(shift
+32)
C In both cases there's one extra limb of src to fetch and combine
C with mm2 to make a qword at 4(%edx), and in the aligned case
C there's an extra limb of dst to be formed from that extra src limb
395 jz L
(finish_one_unaligned
)
398 L
(finish_one_unaligned
):
407 C No extra src limbs
, destination was aligned.
410 C
--+---------------+
412 C
--+---------------+
415 C
--+---------------+---------------+
417 C
--+---------------+---------------+
420 C mm7
= ecx = 64-shift
423 C No extra src limbs
, destination was unaligned.
426 C
--+---------------+
428 C
--+---------------+
431 C
--+---------------+-------+
433 C
--+---------------+-------+
436 C mm7
= ecx = 64-(shift
+32)
C The movd for the unaligned case writes the same data to 4(%edx)
C that the movq does for the aligned case.
447 jz L
(finish_zero_unaligned
)
450 L
(finish_zero_unaligned
):
455 movd
%mm5
, %eax C retval