beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / x86 / k6 / mmx / lshift.asm
blob45be58263378ab71a0e9e499f3428adc18b1cd30
1 dnl AMD K6 mpn_lshift -- mpn left shift.
3 dnl Copyright 1999, 2000, 2002 Free Software Foundation, Inc.
5 dnl This file is part of the GNU MP Library.
6 dnl
7 dnl The GNU MP Library is free software; you can redistribute it and/or modify
8 dnl it under the terms of either:
9 dnl
10 dnl * the GNU Lesser General Public License as published by the Free
11 dnl Software Foundation; either version 3 of the License, or (at your
12 dnl option) any later version.
13 dnl
14 dnl or
15 dnl
16 dnl * the GNU General Public License as published by the Free Software
17 dnl Foundation; either version 2 of the License, or (at your option) any
18 dnl later version.
19 dnl
20 dnl or both in parallel, as here.
21 dnl
22 dnl The GNU MP Library is distributed in the hope that it will be useful, but
23 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
25 dnl for more details.
26 dnl
27 dnl You should have received copies of the GNU General Public License and the
28 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
29 dnl see https://www.gnu.org/licenses/.
31 include(`../config.m4')
C K6: 3.0 cycles/limb


C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
C                       unsigned shift);
C
C Shift the size-limb operand at src left by shift bits, store the size
C result limbs at dst, and return the bits shifted out of the most
C significant limb (they end up in the low bits of the return value).
C
C Arguments are taken from the stack at the defframe offsets below:
C   dst    where the result limbs are written
C   src    where the operand limbs are read
C   size   limb count; the code assumes size >= 1 (size == 0 would fall
C          into the multi-limb path and index below src)
C   shift  bit count; the code assumes 1 <= shift <= 31, since 32-shift is
C          used directly as a shift count
C
C Returns in eax.  ebx is saved and restored; ecx, edx and, on the
C multi-limb path, mm0, mm6 and mm7 are overwritten.  Limbs are produced
C from the most significant down to the least significant.
C
C The loop runs at 3 cycles/limb, limited by decoding and by having 3 mmx
C instructions.  This is despite every second fetch being unaligned.


defframe(PARAM_SHIFT,16)
defframe(PARAM_SIZE, 12)
defframe(PARAM_SRC,  8)
defframe(PARAM_DST,  4)

	TEXT
	ALIGN(32)

PROLOGUE(mpn_lshift)
deflit(`FRAME',0)

	C The 1 limb case can be done without the push %ebx, but it's then
	C still the same speed.  The push is left as a free helping hand for
	C the two_or_more code.

	movl	PARAM_SIZE, %eax
	pushl	%ebx			FRAME_pushl()

	movl	PARAM_SRC, %ebx
	decl	%eax			C size-1, sets ZF when size == 1

	movl	PARAM_SHIFT, %ecx	C movl leaves the flags from decl intact
	jnz	L(two_or_more)


	C Single limb.  eax is zero here, so the shld leaves just the top
	C shift bits of the source limb in eax.

	movl	(%ebx), %edx		C src limb
	movl	PARAM_DST, %ebx

	shldl(	%cl, %edx, %eax)	C return value

	shll	%cl, %edx

	movl	%edx, (%ebx)		C dst limb
	popl	%ebx

	ret


	ALIGN(16)	C avoid offset 0x1f
	nop		C avoid bad cache line crossing
L(two_or_more):
	C eax	size-1
	C ebx	src
	C ecx	shift
	C edx

	movl	(%ebx,%eax,4), %edx	C src high limb
	negl	%ecx

	movd	PARAM_SHIFT, %mm6
	addl	$32, %ecx		C 32-shift

	shrl	%cl, %edx		C bits shifted out the top, the retval

	movd	%ecx, %mm7
	movl	PARAM_DST, %ecx

L(top):
	C eax	counter, size-1 to 1
	C ebx	src
	C ecx	dst
	C edx	retval
	C
	C mm0	scratch
	C mm6	shift
	C mm7	32-shift
	C
	C Each pass loads the limb pair src[eax-1],src[eax] as one qword and
	C shifts it right by 32-shift, which leaves the left-shifted src[eax]
	C combined with the top bits of src[eax-1] in the low dword.  That
	C dword is dst[eax].  The store uses offset 4 because eax has
	C already been decremented.

	movq	-4(%ebx,%eax,4), %mm0
	decl	%eax

	psrlq	%mm7, %mm0

	movd	%mm0, 4(%ecx,%eax,4)
	jnz	L(top)


	C Lowest limb has nothing below it, so it's a plain left shift.

	movd	(%ebx), %mm0
	popl	%ebx

	psllq	%mm6, %mm0
	movl	%edx, %eax		C return value

	movd	%mm0, (%ecx)

	emms				C clear MMX state before returning
	ret				C without this, control runs off the end

EPILOGUE()