source/libs/gmp/gmp-src/mpn/x86/k7/mmx/rshift.asm
dnl  AMD K7 mpn_rshift -- mpn right shift.

dnl  Copyright 1999-2002 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.
include(`../config.m4')


C K7: 1.21 cycles/limb (at 16 limbs/loop).


dnl  K7: UNROLL_COUNT cycles/limb
dnl           4           1.51
dnl           8           1.26
dnl          16           1.21
dnl          32           1.2
dnl  Maximum possible with the current code is 64.

deflit(UNROLL_COUNT, 16)


C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
C                       unsigned shift);
C
C Shift src,size right by shift many bits and store the result in dst,size.
C Zeros are shifted in at the left.  The bits shifted out at the right are
C the return value.
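C
C For reference, the operation corresponds to the following portable C
C sketch (illustrative only, not part of this file; ref_rshift is a
C hypothetical name; assumes 32-bit limbs and 1 <= shift <= 31, as
C mpn_rshift requires on this target):
C
C       mp_limb_t
C       ref_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size, unsigned shift)
C       {
C         mp_limb_t retval = src[0] << (32 - shift);   /* bits shifted out */
C         mp_size_t i;
C         for (i = 0; i < size - 1; i++)
C           dst[i] = (src[i] >> shift) | (src[i + 1] << (32 - shift));
C         dst[size - 1] = src[size - 1] >> shift;
C         return retval;
C       }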
C
C This code uses 64-bit MMX operations, which makes it possible to handle
C two limbs at a time, for a theoretical 1.0 cycles/limb.  Plain integer
C code, on the other hand, suffers from shrd being a vector path decode and
C running at 3 cycles back-to-back.
C
C Full speed depends on source and destination being aligned, and some hairy
C setups and finish-ups are done to arrange this for the loop.
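C
C Per aligned 64-bit chunk of src, the main loop below implements roughly
C the following combine (a sketch of the psrlq/psllq/por pattern, not
C literal code; prev and cur are successive src qwords viewed as uint64_t,
C cur being the more significant):
C
C       /* dst qword = low 64 bits of ((cur:prev) >> shift) */
C       dst_qword = (prev >> shift) | (cur << (64 - shift));
C
C with cur carried over as the next iteration's prev, which is the role
C the alternating mm1/mm2 carry registers play in the unrolled loop.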

ifdef(`PIC',`
deflit(UNROLL_THRESHOLD, 10)
',`
deflit(UNROLL_THRESHOLD, 10)
')

defframe(PARAM_SHIFT,16)
defframe(PARAM_SIZE, 12)
defframe(PARAM_SRC,  8)
defframe(PARAM_DST,  4)

defframe(SAVE_EDI, -4)
defframe(SAVE_ESI, -8)
defframe(SAVE_EBX, -12)
deflit(SAVE_SIZE, 12)

        TEXT
        ALIGN(32)

PROLOGUE(mpn_rshift)
deflit(`FRAME',0)

        movl    PARAM_SIZE, %eax
        movl    PARAM_SRC, %edx
        subl    $SAVE_SIZE, %esp
deflit(`FRAME',SAVE_SIZE)

        movl    PARAM_SHIFT, %ecx
        movl    %edi, SAVE_EDI

        movl    PARAM_DST, %edi
        decl    %eax
        jnz     L(more_than_one_limb)

        movl    (%edx), %edx            C src limb

        shrdl(  %cl, %edx, %eax)        C eax was decremented to zero

        shrl    %cl, %edx

        movl    %edx, (%edi)            C dst limb
        movl    SAVE_EDI, %edi
        addl    $SAVE_SIZE, %esp

        ret


C -----------------------------------------------------------------------------
L(more_than_one_limb):
        C eax   size-1
        C ebx
        C ecx   shift
        C edx   src
        C esi
        C edi   dst
        C ebp

        movd    PARAM_SHIFT, %mm6       C rshift
        movd    (%edx), %mm5            C src low limb
        cmp     $UNROLL_THRESHOLD-1, %eax

        jae     L(unroll)
        leal    (%edx,%eax,4), %edx     C &src[size-1]
        leal    -4(%edi,%eax,4), %edi   C &dst[size-2]

        movd    (%edx), %mm4            C src high limb
        negl    %eax


L(simple_top):
        C eax   loop counter, limbs, negative
        C ebx
        C ecx   shift
        C edx   &src[size-1]
        C esi
        C edi   &dst[size-2]
        C ebp
        C
        C mm0   scratch
        C mm4   src high limb
        C mm5   src low limb
        C mm6   shift
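        C
        C Each pass reads the qword at src[i] (limbs i and i+1), shifts it
        C right, and stores the low dword, giving
        C dst[i] = (src[i] >> shift) | (src[i+1] << (32-shift)).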

        movq    (%edx,%eax,4), %mm0
        incl    %eax

        psrlq   %mm6, %mm0

        movd    %mm0, (%edi,%eax,4)
        jnz     L(simple_top)


        psllq   $32, %mm5
        psrlq   %mm6, %mm4

        psrlq   %mm6, %mm5
        movd    %mm4, 4(%edi)           C dst high limb

        movd    %mm5, %eax              C return value

        movl    SAVE_EDI, %edi
        addl    $SAVE_SIZE, %esp
        emms

        ret


C -----------------------------------------------------------------------------
        ALIGN(16)
L(unroll):
        C eax   size-1
        C ebx
        C ecx   shift
        C edx   src
        C esi
        C edi   dst
        C ebp
        C
        C mm5   src low limb
        C mm6   rshift

        testb   $4, %dl
        movl    %esi, SAVE_ESI
        movl    %ebx, SAVE_EBX

        psllq   $32, %mm5
        jz      L(start_src_aligned)


        C src isn't aligned, process low limb separately (marked xxx) and
        C step src and dst by one limb, making src aligned.
        C
        C source                  edx
        C --+-------+-------+-------+
        C           |          xxx  |
        C --+-------+-------+-------+
        C 4mod8   0mod8   4mod8
        C
        C         dest            edi
        C         --+-------+-------+
        C           |       |  xxx  |
        C         --+-------+-------+

        movq    (%edx), %mm0            C src low two limbs
        addl    $4, %edx
        movl    %eax, PARAM_SIZE        C size-1

        addl    $4, %edi
        decl    %eax                    C size-2 is new size-1

        psrlq   %mm6, %mm0
        movl    %edi, PARAM_DST         C new dst

        movd    %mm0, -4(%edi)
L(start_src_aligned):

        movq    (%edx), %mm1            C src low two limbs
        decl    %eax                    C size-2, two last limbs handled at end
        testl   $4, %edi

        psrlq   %mm6, %mm5
        jz      L(start_dst_aligned)


        C dst isn't aligned, add 4 to make it so, and pretend the shift is
        C 32 bits extra.  Low limb of dst (marked xxx) handled here separately.
        C
        C source          edx
        C --+-------+-------+
        C   |      mm1      |
        C --+-------+-------+
        C 4mod8   0mod8
        C
        C dest                    edi
        C --+-------+-------+-------+
        C   |  xxx  |
        C --+-------+-------+-------+
        C 4mod8   0mod8   4mod8
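        C
        C (Shifting right by shift+32 moves what would be the high dword of
        C a plain shift-by-shift result down into the low dword, where the
        C 4-byte movd stores can pick it up, and the matching left shift
        C used for combining becomes 64-(shift+32) = 32-shift.)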

        movq    %mm1, %mm0
        psrlq   %mm6, %mm1
        addl    $32, %ecx               C shift+32

        movd    %mm1, (%edi)
        movq    %mm0, %mm1
        addl    $4, %edi                C new dst

        movd    %ecx, %mm6
L(start_dst_aligned):


        movq    %mm1, %mm2              C copy of src low two limbs
        negl    %ecx
        andl    $-2, %eax               C round size down to even

        movl    %eax, %ebx
        negl    %eax
        addl    $64, %ecx

        andl    $UNROLL_MASK, %eax
        decl    %ebx

        shll    %eax

        movd    %ecx, %mm7              C lshift = 64-rshift

ifdef(`PIC',`
        call    L(pic_calc)
L(here):
',`
        leal    L(entry) (%eax,%eax,4), %esi
        negl    %eax
')

        shrl    $UNROLL_LOG2, %ebx      C loop counter

        leal    ifelse(UNROLL_BYTES,256,128+) 8(%edx,%eax,2), %edx
        leal    ifelse(UNROLL_BYTES,256,128) (%edi,%eax,2), %edi
        movl    PARAM_SIZE, %eax        C for use at end

        jmp     *%esi
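
        C The computed jump enters the unrolled loop part way through.
        C With N the limb count rounded down to even and
        C skip = (-N) mod UNROLL_COUNT, esi = L(entry) + 10*skip, since
        C each limb is 10 code bytes and eax holds 2*skip after the shll,
        C while edx and edi are biased back by 4*skip bytes so that the
        C fixed displacements in the first chunk executed still address
        C the first limbs.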

ifdef(`PIC',`
L(pic_calc):
        C See mpn/x86/README about old gas bugs
        leal    (%eax,%eax,4), %esi
        addl    $L(entry)-L(here), %esi
        addl    (%esp), %esi
        negl    %eax

        ret_internal
')


C -----------------------------------------------------------------------------
        ALIGN(64)
L(top):
        C eax   size, for use at end
        C ebx   loop counter
        C ecx   lshift
        C edx   src
        C esi   was computed jump
        C edi   dst
        C ebp
        C
        C mm0   scratch
        C mm1   \ carry (alternating)
        C mm2   /
        C mm6   rshift
        C mm7   lshift
        C
        C 10 code bytes/limb
        C
        C The two chunks differ in whether mm1 or mm2 hold the carry.
        C The computed jump puts the initial carry in both mm1 and mm2.

L(entry):
deflit(CHUNK_COUNT, 4)
forloop(i, 0, UNROLL_COUNT/CHUNK_COUNT-1, `
        deflit(`disp0', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
        deflit(`disp1', eval(disp0 + 8))

Zdisp(  movq,   disp0,(%edx), %mm0)
        psrlq   %mm6, %mm2

        movq    %mm0, %mm1
        psllq   %mm7, %mm0

        por     %mm2, %mm0
Zdisp(  movq,   %mm0, disp0,(%edi))


Zdisp(  movq,   disp1,(%edx), %mm0)
        psrlq   %mm6, %mm1

        movq    %mm0, %mm2
        psllq   %mm7, %mm0

        por     %mm1, %mm0
Zdisp(  movq,   %mm0, disp1,(%edi))
')

        addl    $UNROLL_BYTES, %edx
        addl    $UNROLL_BYTES, %edi
        decl    %ebx

        jns     L(top)


deflit(`disp0', ifelse(UNROLL_BYTES,256,-128))
deflit(`disp1', eval(disp0-0 + 8))

        testb   $1, %al
        psrlq   %mm6, %mm2      C wanted rshifted in all cases below
        movl    SAVE_ESI, %esi

        movd    %mm5, %eax      C return value

        movl    SAVE_EBX, %ebx
        jz      L(end_even)


        C Size odd, destination was aligned.
        C
        C source
        C                edx
        C +-------+---------------+--
        C |       |      mm2      |
        C +-------+---------------+--
        C
        C dest                            edi
        C +-------+---------------+---------------+--
        C |       |               |    written    |
        C +-------+---------------+---------------+--
        C
        C mm6 = shift
        C mm7 = ecx = 64-shift
        C
        C
        C Size odd, destination was unaligned.
        C
        C source
        C                edx
        C +-------+---------------+--
        C |       |      mm2      |
        C +-------+---------------+--
        C
        C dest            edi
        C +---------------+---------------+--
        C |               |    written    |
        C +---------------+---------------+--
        C
        C mm6 = shift+32
        C mm7 = ecx = 64-(shift+32)
        C
        C
        C In both cases there's one extra limb of src to fetch and combine
        C with mm2 to make a qword to store, and in the aligned case there's
        C a further extra limb of dst to be formed.

        movd    disp0(%edx), %mm0
        movq    %mm0, %mm1

        psllq   %mm7, %mm0
        testb   $32, %cl

        por     %mm2, %mm0
        psrlq   %mm6, %mm1

        movq    %mm0, disp0(%edi)
        jz      L(finish_odd_unaligned)

        movd    %mm1, disp1(%edi)
L(finish_odd_unaligned):

        movl    SAVE_EDI, %edi
        addl    $SAVE_SIZE, %esp
        emms

        ret


L(end_even):

        C Size even, destination was aligned.
        C
        C source
        C +---------------+--
        C |      mm2      |
        C +---------------+--
        C
        C dest            edi
        C +---------------+---------------+--
        C |               |      mm2      |
        C +---------------+---------------+--
        C
        C mm6 = shift
        C mm7 = ecx = 64-shift
        C
        C
        C Size even, destination was unaligned.
        C
        C source
        C +---------------+--
        C |      mm2      |
        C +---------------+--
        C
        C dest    edi
        C +-------+---------------+--
        C |       |      mm2      |
        C +-------+---------------+--
        C
        C mm6 = shift+32
        C mm7 = 64-(shift+32)
        C
        C
        C The movd for the unaligned case is the same data as the movq for
        C the aligned case; it's just a choice of whether one or two limbs
        C should be written.

        testb   $32, %cl
        movd    %mm2, disp0(%edi)

        jz      L(end_even_unaligned)

        movq    %mm2, disp0(%edi)
L(end_even_unaligned):

        movl    SAVE_EDI, %edi
        addl    $SAVE_SIZE, %esp
        emms

        ret

EPILOGUE()