beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / x86_64 / core2 / lshiftc.asm
blob65c7b2f1b8680c78619cf0c2945e84914e99cfaf
1 dnl x86-64 mpn_lshiftc optimized for "Core 2".
3 dnl Copyright 2007, 2009, 2011, 2012 Free Software Foundation, Inc.
5 dnl This file is part of the GNU MP Library.
6 dnl
7 dnl The GNU MP Library is free software; you can redistribute it and/or modify
8 dnl it under the terms of either:
9 dnl
10 dnl * the GNU Lesser General Public License as published by the Free
11 dnl Software Foundation; either version 3 of the License, or (at your
12 dnl option) any later version.
13 dnl
14 dnl or
15 dnl
16 dnl * the GNU General Public License as published by the Free Software
17 dnl Foundation; either version 2 of the License, or (at your option) any
18 dnl later version.
19 dnl
20 dnl or both in parallel, as here.
21 dnl
22 dnl The GNU MP Library is distributed in the hope that it will be useful, but
23 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
25 dnl for more details.
26 dnl
27 dnl You should have received copies of the GNU General Public License and the
28 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
29 dnl see https://www.gnu.org/licenses/.
31 include(`../config.m4')
34 C cycles/limb
35 C AMD K8,K9 ?
36 C AMD K10 ?
37 C Intel P4 ?
38 C Intel core2 1.5
39 C Intel NHM 2.25 (up to about n = 260, then 1.875)
40 C Intel SBR 2.25
41 C Intel atom ?
42 C VIA nano ?
C INPUT PARAMETERS
C Symbolic names for the four argument registers used by mpn_lshiftc.
define(`rp',	`%rdi')		C destination limb pointer (written through)
define(`up',	`%rsi')		C source limb pointer (read through)
define(`n',	`%rdx')		C number of limbs
define(`cnt',	`%rcx')		C shift count, consumed as R8(cnt)

C NOTE(review): ABI_SUPPORT, ASM_START, TEXT and ALIGN are macros supplied by
C the included config.m4.  ABI_SUPPORT presumably declares the calling
C conventions this file may be built for -- confirm in config.m4.
ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
	TEXT
	ALIGN(16)
C mpn_lshiftc
C
C Shift the n-limb operand at up left by cnt bits, complement every result
C limb, and store the n result limbs at rp.  Limbs are processed from the most
C significant one downwards.  The vacated low cnt bits of the least
C significant limb end up as ones (zeros are shifted in, then complemented).
C
C Registers on entry:
C   rp  = %rdi	destination limb pointer
C   up  = %rsi	source limb pointer
C   n   = %rdx	limb count
C   cnt = %rcx	shift count; only R8(cnt) is used
C Return value:
C   %rax = the bits shifted out of the most significant limb, NOT complemented
C Registers written: %rax, %rdx, %rsi, %rdi, %r8-%r11 and the flags.
C
C n = 0 is not supported: it would enter the b00 path, which always touches
C four limbs.
C NOTE(review): presumably 1 <= cnt <= 63 is required -- confirm with callers.
C NOTE(review): FUNC_ENTRY and FUNC_EXIT come from config.m4; presumably they
C adapt the DOS64 calling convention to the register names above -- confirm.
C
C Structure: n mod 4 selects one of four feed-in paths (b00, b01, b10, b11).
C Each one computes the return value, preloads the register window
C %r8..%r11 and jumps into the matching stage of the 4-way unrolled loop.
C Throughout the loop n holds a biased remaining count; the loop is left as
C soon as subtracting 4 from it borrows.

PROLOGUE(mpn_lshiftc)
	FUNC_ENTRY(4)
	lea	-8(rp,n,8), rp		C rp = address of the top result limb
	lea	-8(up,n,8), up		C up = address of the top source limb

	mov	R32(%rdx), R32(%rax)
	and	$3, R32(%rax)		C eax = n mod 4
	jne	L(nb00)

L(b00):	C n = 4, 8, 12, ...
	mov	(up), %r10
	mov	-8(up), %r11
	xor	R32(%rax), R32(%rax)
	shld	R8(cnt), %r10, %rax	C return value = bits leaving the top limb
	mov	-16(up), %r8
	lea	24(rp), rp		C bias rp so stage 00 stores the top limb
	sub	$4, n
	jmp	L(00)

L(nb00):
	cmp	$2, R32(%rax)		C flags are reused by the jne at nb01
	jae	L(nb01)

L(b01):	C n = 1, 5, 9, ...
	mov	(up), %r9
	xor	R32(%rax), R32(%rax)
	shld	R8(cnt), %r9, %rax	C return value
	sub	$2, n
	jb	L(le1)			C n was 1: a single limb to produce
	mov	-8(up), %r10
	mov	-16(up), %r11
	lea	-8(up), up
	lea	16(rp), rp		C bias rp so stage 01 stores the top limb
	jmp	L(01)

L(le1):	C n = 1
	shl	R8(cnt), %r9
	not	%r9
	mov	%r9, (rp)
	FUNC_EXIT()
	ret

L(nb01):
	jne	L(b11)			C eax = 3; otherwise eax = 2

L(b10):	C n = 2, 6, 10, ...
	mov	(up), %r8
	mov	-8(up), %r9
	xor	R32(%rax), R32(%rax)
	shld	R8(cnt), %r8, %rax	C return value
	sub	$3, n
	jb	L(le2)			C n was 2: finish without the loop
	mov	-16(up), %r10
	lea	-16(up), up
	lea	8(rp), rp		C bias rp so stage 10 stores the top limb
	jmp	L(10)

L(le2):	C n = 2
	shld	R8(cnt), %r9, %r8
	not	%r8
	mov	%r8, (rp)
	shl	R8(cnt), %r9
	not	%r9
	mov	%r9, -8(rp)
	FUNC_EXIT()
	ret				C must not fall through into b11 below

	ALIGN(16)			C performance critical!
L(b11):	C n = 3, 7, 11, ...
	mov	(up), %r11
	mov	-8(up), %r8
	xor	R32(%rax), R32(%rax)
	shld	R8(cnt), %r11, %rax	C return value
	mov	-16(up), %r9
	lea	-24(up), up
	sub	$4, n
	jb	L(end)			C n was 3: only the wind-down remains

C Main loop, four limbs per iteration.  Each stage merges one limb with the
C high bits of the next lower limb, complements the result, stores it, and
C loads a further source limb into the register that was freed one stage ago.
	ALIGN(16)
L(top):	shld	R8(cnt), %r8, %r11
	mov	(up), %r10
	not	%r11
	mov	%r11, (rp)
L(10):	shld	R8(cnt), %r9, %r8
	mov	-8(up), %r11
	not	%r8
	mov	%r8, -8(rp)
L(01):	shld	R8(cnt), %r10, %r9
	mov	-16(up), %r8
	not	%r9
	mov	%r9, -16(rp)
L(00):	shld	R8(cnt), %r11, %r10
	mov	-24(up), %r9
	not	%r10
	mov	%r10, -24(rp)
	add	$-32, up
	lea	-32(rp), rp		C lea leaves the flags untouched
	sub	$4, n
	jnc	L(top)			C continue until the subtraction borrows

C Wind-down: %r11, %r8 and %r9 hold the three least significant source limbs.
L(end):	shld	R8(cnt), %r8, %r11
	not	%r11
	mov	%r11, (rp)
	shld	R8(cnt), %r9, %r8
	not	%r8
	mov	%r8, -8(rp)
	shl	R8(cnt), %r9		C lowest limb: zeros shifted in ...
	not	%r9			C ... which the complement turns into ones
	mov	%r9, -16(rp)
	FUNC_EXIT()
	ret
EPILOGUE()