dnl  AMD K7 mpn_lshift -- mpn left shift.
dnl  Copyright 1999-2002 Free Software Foundation, Inc.
dnl  This file is part of the GNU MP Library.
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.
include(`../config.m4')


C K7: 1.21 cycles/limb (at 16 limbs/loop).


dnl  K7: UNROLL_COUNT cycles/limb
dnl
dnl  Maximum possible with the current code is 64.

dnl  Limbs handled per pass of the unrolled loop (the 16 limbs/loop quoted
dnl  above).
deflit(UNROLL_COUNT, 16)


C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
C                       unsigned shift);
C
C Shift src,size left by shift many bits and store the result in dst,size.
C Zeros are shifted in at the right.  The bits shifted out at the left are
C the return value.
C
C The comments in mpn_rshift apply here too.

C UNROLL_THRESHOLD-1 is what size-1 is compared against once more than one
C limb is known to be present.
C
C NOTE(review): UNROLL_THRESHOLD is defined twice with the same value.
C Presumably each definition sits in a separate conditional arm that is not
C visible in this excerpt -- confirm before merging them.
deflit(UNROLL_THRESHOLD, 10)
deflit(UNROLL_THRESHOLD, 10)
C Stack frame layout.  PARAM_* name the four argument slots (dst, src, size,
C shift) and SAVE_* name the slots for %edi, %esi and %ebx.  The numbers are
C the byte offsets handed to defframe.
defframe(PARAM_SHIFT,16)
defframe(PARAM_SIZE, 12)
defframe(PARAM_SRC,  8)
defframe(PARAM_DST,  4)

defframe(SAVE_EDI, -4)
defframe(SAVE_ESI, -8)
defframe(SAVE_EBX, -12)
C From here on FRAME is SAVE_SIZE.
C NOTE(review): the instruction that moves %esp by SAVE_SIZE, and the
C definition of SAVE_SIZE itself, are not visible in this excerpt.
deflit(`FRAME',SAVE_SIZE)

        movl    PARAM_SHIFT, %ecx       C shift count, used through %cl below

        C The flags tested here come from an instruction that is not visible
        C in this excerpt; per the shldl comment below it was a decrement of
        C %eax, so falling through means %eax is zero.
        jnz     L(more_than_one_limb)

        shldl(  %cl, %edx, %eax)        C eax was decremented to zero
C -----------------------------------------------------------------------------
C Two or more limbs.  Sets up the MMX registers shared by the paths below:
C   mm6  shift count
C   mm5  src high limb (becomes the return value)
C   mm4  src low limb
L(more_than_one_limb):
        movd    PARAM_SHIFT, %mm6
        movd    (%edx,%eax,4), %mm5     C src high limb
        cmp     $UNROLL_THRESHOLD-1, %eax   C branch using this is not visible here
        movd    (%edx), %mm4            C src low limb

        C eax   loop counter, limbs

        C Load two adjacent src limbs as one qword and store one result limb.
        C NOTE(review): the shift instructions between this load and store
        C are not visible in this excerpt.
        movq    -4(%edx,%eax,4), %mm0
        movd    %mm0, 4(%edi,%eax,4)

        movd    %mm4, (%edi)            C dst low limb
        movd    %mm5, %eax              C return value
        addl    $SAVE_SIZE, %esp        C release the SAVE_SIZE bytes of frame
C -----------------------------------------------------------------------------
C Start of the unrolled path: point %edx at the top two src limbs and, if src
C is not qword aligned, peel off the high limb so the remainder is.
        C mm5   src high limb, for return value

        leal    -4(%edx,%eax,4), %edx   C &src[size-2]
        movq    (%edx), %mm1            C src high qword

        C NOTE(review): the alignment test that sets the flags for this jz
        C is not visible in this excerpt.
        jz      L(start_src_aligned)

        C src isn't aligned, process high limb (marked xxx) separately to
        C make it so
        C
        C  source        -4(edx,%eax,4)
        C  +-------+-------+-------+--
        C  +-------+-------+-------+--
        C
        C  dest          -4(edi,%eax,4)
        C  +-------+-------+--
        C  +-------+-------+--

        movl    %eax, PARAM_SIZE        C size-1
        decl    %eax                    C size-2 is new size-1
        movd    %mm1, 4(%edi,%eax,4)
        movq    (%edx), %mm1            C new src high qword
L(start_src_aligned):
        C Point %edi at the top two dst limbs and reduce mm5 to the return
        C value, then deal with dst alignment.
        leal    -4(%edi,%eax,4), %edi   C &dst[size-2]
        psrlq   $32, %mm5               C return value

        C NOTE(review): the alignment test that sets the flags for this jz
        C is not visible in this excerpt.
        jz      L(start_dst_aligned)

        C dst isn't aligned, subtract 4 bytes to make it so, and pretend the
        C shift is 32 bits extra.  High limb of dst (marked xxx) handled
        C here separately.
        C
        C  +-------+-------+--
        C  +-------+-------+--
        C
        C  +-------+-------+-------+--
        C  +-------+-------+-------+--

        addl    $32, %ecx               C shift+32
        movd    %ecx, %mm6              C new lshift
L(start_dst_aligned):
        C Both pointers are now treated as qword aligned.  Work out where to
        C enter the unrolled loop and bias %edx/%edi to match.
        decl    %eax                    C size-2, two last limbs handled at end
        movq    %mm1, %mm2              C copy of src high qword
        andl    $-2, %eax               C round size down to even
        andl    $UNROLL_MASK, %eax      C keep only the UNROLL_MASK bits
        movd    %ecx, %mm7              C rshift = 64-lshift

        C Computed entry address: L(entry) plus 5 times %eax.
        leal    L(entry) (%eax,%eax,4), %esi
        shrl    $UNROLL_LOG2, %ebx      C loop counter

        C The ifelse adds a 128 bias when UNROLL_BYTES is 256; the loop
        C displacements subtract the same 128 in that case.
        leal    ifelse(UNROLL_BYTES,256,128) -8(%edx,%eax,2), %edx
        leal    ifelse(UNROLL_BYTES,256,128) (%edi,%eax,2), %edi
        movl    PARAM_SIZE, %eax        C for use at end

        C NOTE(review): this is a second way of forming the entry address in
        C %esi, as an offset from L(here).  Presumably it belongs to a
        C different conditional arm from the leal L(entry) form above; the
        C conditional and the L(here) label are not visible in this excerpt.
        C
        C See mpn/x86/README about old gas bugs
        leal    (%eax,%eax,4), %esi
        addl    $L(entry)-L(here), %esi
C -----------------------------------------------------------------------------
C Unrolled loop body, generated by forloop in chunks of CHUNK_COUNT limbs.
C Each chunk moves two qwords from src to dst at displacements disp0 and
C disp1 (disp1 is 8 bytes below disp0).
C
C NOTE(review): the shift and combine instructions between each load and
C store, and the loop label, counter decrement and back branch, are not
C visible in this excerpt.  The closing quote and parenthesis of forloop are
C also missing from view; they are placed before the two subl instructions
C because disp0 already steps per chunk, so the pointers should move once per
C pass -- confirm against the full file.
        C eax   size (for use at end)
        C mm1   \ carry (alternating, mm2 first)

        C The two chunks differ in whether mm1 or mm2 hold the carry.
        C The computed jump puts the initial carry in both mm1 and mm2.

deflit(CHUNK_COUNT, 4)
forloop(i, 0, UNROLL_COUNT/CHUNK_COUNT-1, `
        deflit(`disp0', eval(-i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
        deflit(`disp1', eval(disp0 - 8))

Zdisp(  movq,   disp0,(%edx), %mm0)
Zdisp(  movq,   %mm0, disp0,(%edi))

Zdisp(  movq,   disp1,(%edx), %mm0)
Zdisp(  movq,   %mm0, disp1,(%edi))
')

        subl    $UNROLL_BYTES, %edx
        subl    $UNROLL_BYTES, %edi
C disp(n) expands to n, less 128 when UNROLL_BYTES is 256, and is passed
C through m4_empty_if_zero.  It is used for the tail stores below.
define(`disp', `m4_empty_if_zero(eval($1 ifelse(UNROLL_BYTES,256,-128)))')

        psllq   %mm6, %mm2      C wanted left shifted in all cases below

        C NOTE(review): the test and branch that separate the odd and even
        C size tails are not visible in this excerpt.


        C Size odd, destination was aligned.
        C
        C  --+---------------+-------+
        C  --+---------------+-------+
        C
        C  --+---------------+---------------+-------+
        C  --+---------------+---------------+-------+
        C
        C mm7 = ecx = 64-shift


        C Size odd, destination was unaligned.
        C
        C  --+---------------+-------+
        C  --+---------------+-------+
        C
        C  --+---------------+---------------+
        C  --+---------------+---------------+
        C
        C mm7 = ecx = 64-(shift+32)


        C In both cases there's one extra limb of src to fetch and combine
        C with mm2 to make a qword at (%edi), and in the aligned case
        C there's an extra limb of dst to be formed from that extra src limb
        C left shifted.

        movd    disp(4) (%edx), %mm0    C the extra src limb
        movq    %mm0, disp(0) (%edi)    C qword store at (%edi)

        C NOTE(review): the test that sets the flags for this jz is not
        C visible in this excerpt.
        jz      L(end_odd_unaligned)
        movd    %mm1, disp(-4) (%edi)   C extra dst limb, aligned case only
L(end_odd_unaligned):

        addl    $SAVE_SIZE, %esp        C release the SAVE_SIZE bytes of frame
C Tail for an even size.  Uses the disp() displacement macro and the
C already left shifted mm2.
        C Size even, destination was aligned.
        C
        C  --+---------------+
        C  --+---------------+
        C
        C  --+---------------+---------------+
        C  --+---------------+---------------+
        C
        C mm7 = ecx = 64-shift


        C Size even, destination was unaligned.
        C
        C  --+---------------+
        C  --+---------------+
        C
        C  --+---------------+-------+
        C  --+---------------+-------+
        C
        C mm7 = ecx = 64-(shift+32)


        C The movq for the aligned case overwrites the movd for the
        C unaligned case.

        movd    %mm2, disp(4) (%edi)    C always stored

        C NOTE(review): the test that sets the flags for this jz is not
        C visible in this excerpt.
        jz      L(end_even_unaligned)
        movq    %mm0, disp(0) (%edi)    C aligned case: full qword over the movd
L(end_even_unaligned):

        addl    $SAVE_SIZE, %esp        C release the SAVE_SIZE bytes of frame