dnl  AMD K7 mpn_rshift -- mpn right shift.

dnl  Copyright 1999-2002 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.
include(`../config.m4')


C K7: 1.21 cycles/limb (at 16 limbs/loop).


dnl  K7: UNROLL_COUNT cycles/limb
dnl
dnl  Maximum possible with the current code is 64.

dnl  UNROLL_COUNT is the number of limbs handled per pass of the unrolled
dnl  loop; the loop body is generated UNROLL_COUNT/CHUNK_COUNT times.
deflit(UNROLL_COUNT, 16)
C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
C                       unsigned shift);
C
C Shift src,size right by shift many bits and store the result in dst,size.
C Zeros are shifted in at the left.  The bits shifted out at the right are
C the return value.
C
C This code uses 64-bit MMX operations, which makes it possible to handle
C two limbs at a time, for a theoretical 1.0 cycles/limb.  Plain integer
C code, on the other hand, suffers from shrd being a vector path decode and
C running at 3 cycles back-to-back.
C
C Full speed depends on source and destination being aligned, and some hairy
C setups and finish-ups are done to arrange this for the loop.
C UNROLL_THRESHOLD, less one, is compared against eax at
C L(more_than_one_limb).  The same value was given twice in this copy, so
C a single definition is kept.
deflit(UNROLL_THRESHOLD, 10)

C Stack parameter slots, with the byte offsets handed to defframe.
defframe(PARAM_SHIFT,16)
defframe(PARAM_SIZE, 12)
defframe(PARAM_SRC,  8)
defframe(PARAM_DST,  4)

C Slots at negative offsets, named for the registers edi, esi and ebx.
defframe(SAVE_EDI, -4)
defframe(SAVE_ESI, -8)
defframe(SAVE_EBX, -12)
C Entry sequence and the one-limb case.
C
C ecx is loaded with the shift count.  The jnz goes to the general code
C when more than one limb is to be processed, otherwise the single limb
C is handled inline.  With eax zero, shrdl moves the low cl bits of edx
C into the top of eax.
C
C NOTE(review): only part of this sequence is visible in this copy.  The
C instructions that load eax, edx and edi, the one that sets the flags
C for the jnz, any shift of edx before the store, and the return are not
C shown here, confirm against the complete file.
deflit(`FRAME',SAVE_SIZE)
	movl	PARAM_SHIFT, %ecx

	jnz	L(more_than_one_limb)

	movl	(%edx), %edx		C src limb

	shrdl(	%cl, %edx, %eax)	C eax was decremented to zero

	movl	%edx, (%edi)		C dst limb

	addl	$SAVE_SIZE, %esp
C -----------------------------------------------------------------------------
C General case, more than one limb.
C
C mm6 receives the shift count and mm5 the low source limb, which is
C moved to eax as the return value at the end.  The cmp of eax against
C UNROLL_THRESHOLD-1 sets flags for a branch that is not visible here.
C edx and edi are then pointed at the high ends of src and dst, and eax
C is used as a negative limb index from those pointers.
C
C NOTE(review): the shift and combine instructions, the loop label and
C the loop branch of this path are not visible in this copy.
L(more_than_one_limb):
	movd	PARAM_SHIFT, %mm6	C rshift
	movd	(%edx), %mm5		C src low limb
	cmp	$UNROLL_THRESHOLD-1, %eax

	leal	(%edx,%eax,4), %edx	C &src[size-1]
	leal	-4(%edi,%eax,4), %edi	C &dst[size-2]

	movd	(%edx), %mm4		C src high limb

	C eax	loop counter, limbs, negative

	movq	(%edx,%eax,4), %mm0	C two src limbs at index eax

	movd	%mm0, (%edi,%eax,4)	C one dst limb at index eax

	movd	%mm4, 4(%edi)		C dst high limb

	movd	%mm5, %eax		C return value

	addl	$SAVE_SIZE, %esp
C -----------------------------------------------------------------------------
C Start of the unrolled setup, source alignment step.
C
C The jz skips this step when src is already aligned.  Otherwise the low
C two src limbs are fetched, size-1 is saved in PARAM_SIZE, eax drops by
C one limb, and the current edi is stored in PARAM_DST as the new dst.
C
C NOTE(review): the instruction setting the flags for the jz, the pointer
C steps and the store of the separately handled low limb are not visible
C in this copy.
	jz	L(start_src_aligned)

	C src isn't aligned, process low limb separately (marked xxx) and
	C step src and dst by one limb, making src aligned.
	C
	C --+-------+-------+-------+
	C --+-------+-------+-------+
	C
	C --+-------+-------+
	C --+-------+-------+

	movq	(%edx), %mm0		C src low two limbs

	movl	%eax, PARAM_SIZE	C size-1

	decl	%eax			C size-2 is new size-1

	movl	%edi, PARAM_DST		C new dst
C Source is aligned from here on.  mm1 takes the low two src limbs and
C eax is reduced again because the two last limbs are handled at the end.
C When dst is not aligned, ecx gains 32 and edi gains 4, so the unrolled
C code sees an aligned dst and a shift that is 32 bits larger.
C
C NOTE(review): in this copy the jz directly follows the decl, a separate
C test of the dst alignment is not visible, confirm against the complete
C file.
L(start_src_aligned):
	movq	(%edx), %mm1		C src low two limbs
	decl	%eax			C size-2, two last limbs handled at end

	jz	L(start_dst_aligned)

	C dst isn't aligned, add 4 to make it so, and pretend the shift is
	C 32 bits extra.  Low limb of dst (marked xxx) handled here separately.
	C
	C --+-------+-------+
	C --+-------+-------+
	C
	C --+-------+-------+-------+
	C --+-------+-------+-------+

	addl	$32, %ecx		C shift+32

	addl	$4, %edi		C new dst
C Both pointers are aligned here.  This part prepares the unrolled loop:
C mm2 gets a copy of the first two src limbs, eax is rounded down to even
C and masked with UNROLL_MASK, mm7 takes the left shift count from ecx,
C ebx becomes the loop counter, and edx and edi are advanced by 2*eax
C plus a bias that depends on UNROLL_BYTES.  esi is formed from 5*eax and
C the address of L(entry).
C
C NOTE(review): esi is computed twice below, the second time after eax
C has been reloaded from PARAM_SIZE.  Presumably these are alternative
C forms whose selecting conditionals and the L(here) label are not
C visible in this copy, confirm against the complete file.  The code that
C derives ecx and ebx is also not visible.
L(start_dst_aligned):
	movq	%mm1, %mm2		C copy of src low two limbs

	andl	$-2, %eax		C round size down to even

	andl	$UNROLL_MASK, %eax

	movd	%ecx, %mm7		C lshift = 64-rshift

	leal	L(entry) (%eax,%eax,4), %esi

	shrl	$UNROLL_LOG2, %ebx	C loop counter

	leal	ifelse(UNROLL_BYTES,256,128+) 8(%edx,%eax,2), %edx
	leal	ifelse(UNROLL_BYTES,256,128) (%edi,%eax,2), %edi
	movl	PARAM_SIZE, %eax	C for use at end

	C See mpn/x86/README about old gas bugs
	leal	(%eax,%eax,4), %esi
	addl	$L(entry)-L(here), %esi
C -----------------------------------------------------------------------------
C Unrolled loop body.
C
C forloop expands the quoted body UNROLL_COUNT/CHUNK_COUNT times.  Each
C expansion covers CHUNK_COUNT limbs as two qwords, at byte offsets disp0
C and disp1 = disp0+8 from edx (load) and edi (store).  After the last
C expansion both pointers step on by UNROLL_BYTES.
C
C The quoted forloop body is closed before the two addl instructions,
C since disp0 already advances with i inside the expansion.
C
C NOTE(review): the shift and combine instructions between each load and
C store, the loop label, L(entry) and the loop branch are not visible in
C this copy.

	C eax	size, for use at end
	C esi	was computed jump
	C mm1	\ carry (alternating)
	C
	C The two chunks differ in whether mm1 or mm2 hold the carry.
	C The computed jump puts the initial carry in both mm1 and mm2.

deflit(CHUNK_COUNT, 4)
forloop(i, 0, UNROLL_COUNT/CHUNK_COUNT-1, `
	deflit(`disp0', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
	deflit(`disp1', eval(disp0 + 8))

Zdisp(	movq,	disp0,(%edx), %mm0)
Zdisp(	movq,	%mm0, disp0,(%edi))

Zdisp(	movq,	disp1,(%edx), %mm0)
Zdisp(	movq,	%mm0, disp1,(%edi))
')

	addl	$UNROLL_BYTES, %edx
	addl	$UNROLL_BYTES, %edi
C Finish-up after the unrolled loop, odd size.
C
C disp0 and disp1 are redefined for the addressing used after the loop.
C mm2 is shifted right by mm6 and eax takes the return value from mm5.
C One further src limb is loaded with movd and a qword is stored at
C disp0.  When the jz is not taken one more limb from mm1 is stored at
C disp1.
C
C NOTE(review): the flag-setting instructions for the branches, the
C shift and combine steps between the load and the stores, the register
C restores and the return are not visible in this copy.
deflit(`disp0', ifelse(UNROLL_BYTES,256,-128))
deflit(`disp1', eval(disp0-0 + 8))

	psrlq	%mm6, %mm2	C wanted rshifted in all cases below

	movd	%mm5, %eax	C return value


	C Size odd, destination was aligned.
	C
	C +-------+---------------+--
	C +-------+---------------+--
	C
	C +-------+---------------+---------------+--
	C +-------+---------------+---------------+--
	C
	C mm7 = ecx = 64-shift


	C Size odd, destination was unaligned.
	C
	C +-------+---------------+--
	C +-------+---------------+--
	C
	C +---------------+---------------+--
	C +---------------+---------------+--
	C
	C mm7 = ecx = 64-(shift+32)


	C In both cases there's one extra limb of src to fetch and combine
	C with mm2 to make a qword to store, and in the aligned case there's
	C a further extra limb of dst to be formed.

	movd	disp0(%edx), %mm0

	movq	%mm0, disp0(%edi)
	jz	L(finish_odd_unaligned)

	movd	%mm1, disp1(%edi)
L(finish_odd_unaligned):

	addl	$SAVE_SIZE, %esp
C Finish-up after the unrolled loop, even size.
C
C The low limb of mm2 is first stored at disp0 with movd.  When the jz is
C not taken the same location is then written again with movq, so that
C both limbs of mm2 are stored.  MMX moves leave the flags alone, so the
C jz still sees the flags from before the movd.
C
C NOTE(review): the flag-setting instruction for the jz, the register
C restores and the return are not visible in this copy.

	C Size even, destination was aligned.
	C
	C +---------------+--
	C +---------------+--
	C
	C +---------------+---------------+--
	C +---------------+---------------+--
	C
	C mm7 = ecx = 64-shift


	C Size even, destination was unaligned.
	C
	C +---------------+--
	C +---------------+--
	C
	C +-------+---------------+--
	C +-------+---------------+--
	C
	C mm7 = 64-(shift+32)


	C The movd for the unaligned case is the same data as the movq for
	C the aligned case, it's just a choice between whether one or two
	C limbs should be written.

	movd	%mm2, disp0(%edi)

	jz	L(end_even_unaligned)

	movq	%mm2, disp0(%edi)
L(end_even_unaligned):

	addl	$SAVE_SIZE, %esp