1 dnl AMD K6-2 mpn_rshift -- mpn right shift.
3 dnl Copyright 1999, 2000, 2002 Free Software Foundation, Inc.
5 dnl This file is part of the GNU MP Library.
7 dnl The GNU MP Library is free software
; you can redistribute it and/or modify
8 dnl it under the terms of
either:
10 dnl
* the GNU Lesser General
Public License as published by the Free
11 dnl Software Foundation
; either version 3 of the License, or (at your
12 dnl option
) any later version.
16 dnl
* the GNU General
Public License as published by the Free Software
17 dnl Foundation
; either version 2 of the License, or (at your option) any
20 dnl
or both
in parallel
, as here.
22 dnl The GNU MP Library is distributed
in the hope that it will be useful
, but
23 dnl WITHOUT ANY WARRANTY
; without even the implied warranty of MERCHANTABILITY
24 dnl
or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
Public License
27 dnl You should have received copies of the GNU General
Public License
and the
28 dnl GNU Lesser General
Public License along with the GNU MP Library. If
not,
29 dnl see https://www.gnu.org/licenses/.
31 include(`..
/config.m4
')
34 C K6-2: 1.75 cycles/limb
37 C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
41 defframe(PARAM_SHIFT,16)
42 defframe(PARAM_SIZE, 12)
43 defframe(PARAM_SRC, 8)
44 defframe(PARAM_DST, 4)
47 dnl Minimum 9, because the unrolled loop can't handle less.
49 deflit(UNROLL_THRESHOLD, 9)
57 C The 1 limb case can be done without the push %ebx, but it's then
58 C still the same speed. The push is left as a free helping hand for
59 C the two_or_more code.
62 pushl %ebx FRAME_pushl()
67 movl PARAM_SHIFT, %ecx
70 movl (%ebx), %edx C src limb
73 shrdl( %cl, %edx, %eax) C return value
77 movl %edx, (%ebx) C dst limb
83 C -----------------------------------------------------------------------------
84 ALIGN(16) C avoid offset 0x1f
91 movl (%ebx), %edx C src low limb
95 movd PARAM_SHIFT, %mm6
98 cmpl $UNROLL_THRESHOLD-1, %eax
111 leal (%ebx,%eax,4), %ebx
113 leal -4(%ecx,%eax,4), %ecx
116 C This loop runs at about 3 cycles/limb, which is the amount of
117 C decoding, and this is despite every second access being unaligned.
120 C eax counter, -(size-1) to -1
128 Zdisp( movq, 0,(%ebx,%eax,4), %mm0)
133 Zdisp( movd, %mm0, 0,(%ecx,%eax,4))
146 C -----------------------------------------------------------------------------
157 subl $7, %eax C size-8
162 movq (%ebx), %mm2 C src low qword
163 leal (%ebx,%eax,4), %ebx C src end - 32
166 leal (%ecx,%eax,4), %ecx C dst end - 32
168 notl %eax C -(size-7)
174 Zdisp( movd, %mm2, 0,(%ecx,%eax,4)) C dst low limb
175 movq 4(%ebx,%eax,4), %mm2 C new src low qword
178 movq 12(%ebx,%eax,4), %mm0 C src second lowest qword
179 nop C avoid bad cache line crossing
182 C This loop is the important bit, the rest is just support for it.
183 C Four src limbs are held at the start, and four more will be read.
184 C Four dst limbs will be written. This schedule seems necessary for
187 C The use of -(size-7) lets the loop stop when %eax becomes >= 0 and
188 C leaves 0 to 3 which can be tested with test $1 and $2.
191 C eax counter, -(size-7) step by +4 until >=0
209 movq 4(%ebx,%eax,4), %mm0
212 movq %mm2, -12(%ecx,%eax,4)
218 movq 12(%ebx,%eax,4), %mm0
220 movq %mm1, -4(%ecx,%eax,4)
221 ja L(top) C jump if no carry and not zero
225 C Now have the four limbs in mm2 (low) and mm0 (high), and %eax is 0
226 C to 3 representing respectively 3 to 0 further limbs.
228 testl $2, %eax C testl to avoid bad cache line crossings
231 C Two or three extra limbs: rshift mm2, OR it with lshifted mm0, mm0
232 C becomes new mm2 and a new mm0 is loaded.
241 movq 12(%ebx,%eax,4), %mm0
243 movq %mm2, -4(%ecx,%eax,4)
257 movq %mm2, 4(%ecx,%eax,4)
261 C one further extra limb to process
263 movd 32-4(%ebx), %mm0 C src[size-1], most significant limb
272 movq %mm1, 32-12(%ecx) C dst[size-3,size-2]
273 movd %mm2, 32-4(%ecx) C dst[size-1]
275 movl %edx, %eax C retval
281 nop C avoid bad cache line crossing
283 C no further extra limbs
285 movq %mm1, 32-8(%ecx) C dst[size-2,size-1]
286 movl %edx, %eax C retval