beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / x86 / k6 / k62mmx / rshift.asm
blobf604a7bd52910cc16c495fa1c2fbb7bc6ac21ffc
1 dnl AMD K6-2 mpn_rshift -- mpn right shift.
3 dnl Copyright 1999, 2000, 2002 Free Software Foundation, Inc.
5 dnl This file is part of the GNU MP Library.
6 dnl
7 dnl The GNU MP Library is free software; you can redistribute it and/or modify
8 dnl it under the terms of either:
9 dnl
10 dnl * the GNU Lesser General Public License as published by the Free
11 dnl Software Foundation; either version 3 of the License, or (at your
12 dnl option) any later version.
13 dnl
14 dnl or
15 dnl
16 dnl * the GNU General Public License as published by the Free Software
17 dnl Foundation; either version 2 of the License, or (at your option) any
18 dnl later version.
19 dnl
20 dnl or both in parallel, as here.
21 dnl
22 dnl The GNU MP Library is distributed in the hope that it will be useful, but
23 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
25 dnl for more details.
26 dnl
27 dnl You should have received copies of the GNU General Public License and the
28 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
29 dnl see https://www.gnu.org/licenses/.
31 include(`../config.m4')
34 C K6-2: 1.75 cycles/limb
37 C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
38 C unsigned shift);
39 C Writes src[0..size-1] shifted right by shift bits to dst[0..size-1] and
40 C returns the bits shifted out of src[0], left-justified in a 32-bit limb.
41 defframe(PARAM_SHIFT,16) C 4th argument, the bit count
42 defframe(PARAM_SIZE, 12) C 3rd argument, the limb count
43 defframe(PARAM_SRC, 8) C 2nd argument
44 defframe(PARAM_DST, 4) C 1st argument
45 deflit(`FRAME',0)
47 dnl Minimum 9, because the unrolled loop can't handle less.
48 dnl
49 deflit(UNROLL_THRESHOLD, 9)
51 TEXT
52 ALIGN(32)
54 PROLOGUE(mpn_rshift)
55 deflit(`FRAME',0)
56 C NOTE(review): size and shift are not validated here; the code appears to rely on size >= 1 and 1 <= shift <= 31 - confirm against the mpn_rshift contract.
57 C The 1 limb case can be done without the push %ebx, but it's then
58 C still the same speed. The push is left as a free helping hand for
59 C the two_or_more code.
61 movl PARAM_SIZE, %eax
62 pushl %ebx FRAME_pushl()
64 movl PARAM_SRC, %ebx
65 decl %eax C eax = size-1, ZF set when size is 1
67 movl PARAM_SHIFT, %ecx C movl leaves the decl flags intact for the jnz
68 jnz L(two_or_more)
70 movl (%ebx), %edx C src limb
71 movl PARAM_DST, %ebx
73 shrdl( %cl, %edx, %eax) C return value: eax is 0 here, so it receives only the bits shifted out of edx
75 shrl %cl, %edx
77 movl %edx, (%ebx) C dst limb
78 popl %ebx
80 ret
83 C -----------------------------------------------------------------------------
84 ALIGN(16) C avoid offset 0x1f
85 L(two_or_more):
86 C eax size-1
87 C ebx src
88 C ecx shift
89 C edx
91 movl (%ebx), %edx C src low limb
92 negl %ecx
94 addl $32, %ecx C ecx = 32-shift
95 movd PARAM_SHIFT, %mm6
97 shll %cl, %edx C retval = src[0] << (32-shift)
98 cmpl $UNROLL_THRESHOLD-1, %eax
100 jae L(unroll) C taken when size >= UNROLL_THRESHOLD
103 C eax size-1
104 C ebx src
105 C ecx 32-shift
106 C edx retval
108 C mm6 shift
110 movl PARAM_DST, %ecx
111 leal (%ebx,%eax,4), %ebx C &src[size-1]
113 leal -4(%ecx,%eax,4), %ecx C &dst[size-2]
114 negl %eax
116 C This loop runs at about 3 cycles/limb, which is the amount of
117 C decoding, and this is despite every second access being unaligned.
119 L(simple):
120 C eax counter, -(size-1) to -1
121 C ebx &src[size-1]
122 C ecx &dst[size-2]
123 C edx retval
125 C mm0 scratch
126 C mm6 shift
128 Zdisp( movq, 0,(%ebx,%eax,4), %mm0) C two adjacent src limbs
129 incl %eax
131 psrlq %mm6, %mm0
133 Zdisp( movd, %mm0, 0,(%ecx,%eax,4)) C low 32 bits are a finished dst limb
134 jnz L(simple) C flags are still those of the incl
135 C mm0 still holds the last shifted pair, so one qword store writes
136 C dst[size-2] (again) and dst[size-1].
137 movq %mm0, (%ecx)
138 movl %edx, %eax
140 popl %ebx
142 femms C leave MMX state before returning
143 ret
146 C -----------------------------------------------------------------------------
147 ALIGN(16)
148 L(unroll):
149 C eax size-1
150 C ebx src
151 C ecx 32-shift
152 C edx retval
154 C mm6 shift
156 addl $32, %ecx C ecx = 64-shift
157 subl $7, %eax C size-8
159 movd %ecx, %mm7
160 movl PARAM_DST, %ecx
162 movq (%ebx), %mm2 C src low qword
163 leal (%ebx,%eax,4), %ebx C src end - 32
165 testb $4, %cl C bit 2 of the dst address
166 leal (%ecx,%eax,4), %ecx C dst end - 32
168 notl %eax C -(size-7)
169 jz L(dst_aligned) C leal and notl leave the testb flags alone
170 C Bit 2 of dst is set: store the low limb on its own so the qword stores in the loop start at dst+4.
171 psrlq %mm6, %mm2
172 incl %eax
174 Zdisp( movd, %mm2, 0,(%ecx,%eax,4)) C dst low limb
175 movq 4(%ebx,%eax,4), %mm2 C new src low qword
176 L(dst_aligned):
178 movq 12(%ebx,%eax,4), %mm0 C src second lowest qword
179 nop C avoid bad cache line crossing
182 C This loop is the important bit, the rest is just support for it.
183 C Four src limbs are held at the start, and four more will be read.
184 C Four dst limbs will be written. This schedule seems necessary for
185 C full speed.
187 C The use of -(size-7) lets the loop stop when %eax becomes >= 0,
188 C and leaves 0 to 3 which can be tested with test $1 and $2.
190 L(top):
191 C eax counter, -(size-7) step by +4 until >=0
192 C ebx src end - 32
193 C ecx dst end - 32
194 C edx retval
196 C mm0 src next qword
197 C mm1 scratch
198 C mm2 src prev qword
199 C mm6 shift
200 C mm7 64-shift
202 psrlq %mm6, %mm2
203 addl $4, %eax
205 movq %mm0, %mm1
206 psllq %mm7, %mm0
208 por %mm0, %mm2
209 movq 4(%ebx,%eax,4), %mm0
211 psrlq %mm6, %mm1
212 movq %mm2, -12(%ecx,%eax,4)
214 movq %mm0, %mm2
215 psllq %mm7, %mm0
217 por %mm0, %mm1
218 movq 12(%ebx,%eax,4), %mm0
220 movq %mm1, -4(%ecx,%eax,4)
221 ja L(top) C jump if no carry and not zero (flags from the addl)
225 C Now have the four limbs in mm2 (low) and mm0 (high), and %eax is 0
226 C to 3 representing respectively 3 to 0 further limbs.
228 testl $2, %eax C testl to avoid bad cache line crossings
229 jnz L(finish_nottwo)
231 C Two or three extra limbs: rshift mm2, OR it with lshifted mm0, mm0
232 C becomes new mm2 and a new mm0 is loaded.
234 psrlq %mm6, %mm2
235 movq %mm0, %mm1
237 psllq %mm7, %mm0
238 addl $2, %eax
240 por %mm0, %mm2
241 movq 12(%ebx,%eax,4), %mm0
243 movq %mm2, -4(%ecx,%eax,4)
244 movq %mm1, %mm2
245 L(finish_nottwo):
248 testb $1, %al
249 psrlq %mm6, %mm2
251 movq %mm0, %mm1
252 psllq %mm7, %mm0
254 por %mm0, %mm2
255 psrlq %mm6, %mm1
257 movq %mm2, 4(%ecx,%eax,4)
258 jnz L(finish_even) C the MMX ops since the testb leave its flags alone
261 C one further extra limb to process
263 movd 32-4(%ebx), %mm0 C src[size-1], most significant limb
264 popl %ebx
266 movq %mm0, %mm2
267 psllq %mm7, %mm0
269 por %mm0, %mm1
270 psrlq %mm6, %mm2
272 movq %mm1, 32-12(%ecx) C dst[size-3,size-2]
273 movd %mm2, 32-4(%ecx) C dst[size-1]
275 movl %edx, %eax C retval
277 femms C leave MMX state before returning
278 ret
281 nop C avoid bad cache line crossing
282 L(finish_even):
283 C no further extra limbs
285 movq %mm1, 32-8(%ecx) C dst[size-2,size-1]
286 movl %edx, %eax C retval
288 popl %ebx
290 femms C leave MMX state before returning
291 ret
293 EPILOGUE()