beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / x86_64 / pentium4 / lshiftc.asm
blobfc64676574eb4f70a91615b142b4f31ef056d7b0
1 dnl x86-64 mpn_lshiftc optimized for Pentium 4.
3 dnl Copyright 2003, 2005, 2007, 2008, 2010, 2012 Free Software Foundation,
4 dnl Inc.
6 dnl This file is part of the GNU MP Library.
7 dnl
8 dnl The GNU MP Library is free software; you can redistribute it and/or modify
9 dnl it under the terms of either:
10 dnl
11 dnl * the GNU Lesser General Public License as published by the Free
12 dnl Software Foundation; either version 3 of the License, or (at your
13 dnl option) any later version.
14 dnl
15 dnl or
16 dnl
17 dnl * the GNU General Public License as published by the Free Software
18 dnl Foundation; either version 2 of the License, or (at your option) any
19 dnl later version.
20 dnl
21 dnl or both in parallel, as here.
22 dnl
23 dnl The GNU MP Library is distributed in the hope that it will be useful, but
24 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
25 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
26 dnl for more details.
27 dnl
28 dnl You should have received copies of the GNU General Public License and the
29 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
30 dnl see https://www.gnu.org/licenses/.
32 include(`../config.m4')
35 C cycles/limb
36 C AMD K8,K9 ?
37 C AMD K10 ?
38 C Intel P4 4.15
39 C Intel core2 ?
40 C Intel corei ?
41 C Intel atom ?
42 C VIA nano ?
44 C INPUT PARAMETERS
C The four m4 names below alias the registers that mpn_lshiftc reads its
C arguments from; the routine refers to them only through these names.
45 define(`rp',`%rdi') C base address of the limbs written by the movq stores
46 define(`up',`%rsi') C base address of the limbs loaded and shifted
47 define(`n',`%rdx') C limb index/count, scaled by 8 in every address
48 define(`cnt',`%cl') C left shift count; read via R32(%rcx) in the code
C NOTE(review): ABI_SUPPORT, ASM_START, TEXT and ALIGN are macros from the
C included config.m4 and are not visible here. Presumably they declare the
C supported calling conventions, select the code section and align the
C entry point to 32 bytes -- confirm against the m4 definitions.
50 ABI_SUPPORT(DOS64)
51 ABI_SUPPORT(STD64)
53 ASM_START()
54 TEXT
55 ALIGN(32)
56 PROLOGUE(mpn_lshiftc)
C mpn_lshiftc: shift the n-limb operand at up left by cnt bits and store
C the one-complement of each shifted limb at rp.
C
C Limbs are 64 bits and are processed from the most significant one down:
C   rp[i] = NOT ((up[i] << cnt) | (up[i-1] >> rsh)),  rsh = (-cnt) & 63
C   rp[0] = NOT (up[0] << cnt)
C Return value in %rax: up[n-1] >> rsh, the bits shifted out at the top.
C
C MMX register roles:
C   %mm4 = left shift count, %mm5 = right shift count rsh,
C   %mm6 = all ones, xored into a limb to complement it,
C   %mm0-%mm3 = limbs in flight.
C pandn computes (NOT dst) AND src; with src already complemented that is
C NOT (dst OR uncomplemented src), so no separate por is needed.
C
C Structure: a dispatch on (n+1) mod 4 peels off 0, 1 or 2 leading limbs so
C that the 4-way unrolled loop at L(top) always starts with n = 3 mod 4;
C L(end) and L(ast) then write the final three limbs rp[2], rp[1], rp[0].
C
C NOTE(review): rsh equals 64 - cnt only for cnt in 1..63, and up[n-1] is
C read unconditionally, so n >= 1 is assumed -- confirm against callers.
C NOTE(review): FUNC_ENTRY and FUNC_EXIT are defined in the included m4
C files; presumably they only adapt the DOS64 argument registers and do not
C return by themselves, hence the explicit ret before EPILOGUE.
57 FUNC_ENTRY(4)
58 mov -8(up,n,8), %rax C rax = up[n-1], the most significant limb
59 pcmpeqd %mm6, %mm6 C 0xffff...fff
60 movd R32(%rcx), %mm4 C mm4 = cnt, the left shift count
61 neg R32(%rcx) C put rsh count in cl
62 and $63, R32(%rcx)
63 movd R32(%rcx), %mm5 C mm5 = rsh, the right shift count
65 lea 1(n), R32(%r8) C r8d = n+1; only its low two bits are used
67 shr R8(%rcx), %rax C function return value
69 and $3, R32(%r8)
70 je L(rol) C jump for n = 3, 7, 11, ...
72 dec R32(%r8)
73 jne L(1)
74 C n = 4, 8, 12, ...
C Peel one limb, rp[n-1], so that n becomes 3 mod 4.
75 movq -8(up,n,8), %mm2
76 psllq %mm4, %mm2
77 movq -16(up,n,8), %mm0
78 pxor %mm6, %mm2 C mm2 = NOT (up[n-1] << cnt)
79 psrlq %mm5, %mm0 C mm0 = up[n-2] >> rsh
80 pandn %mm2, %mm0 C mm0 = NOT mm0 AND mm2 = complemented limb
81 movq %mm0, -8(rp,n,8)
82 dec n
83 jmp L(rol)
85 L(1): dec R32(%r8)
86 je L(1x) C jump for n = 1, 5, 9, 13, ...
87 C n = 2, 6, 10, 14, ...
C Peel one limb, rp[n-1], so that n becomes 1 mod 4.
88 movq -8(up,n,8), %mm2
89 psllq %mm4, %mm2
90 movq -16(up,n,8), %mm0
91 pxor %mm6, %mm2
92 psrlq %mm5, %mm0
93 pandn %mm2, %mm0
94 movq %mm0, -8(rp,n,8)
95 dec n
96 L(1x):
C Here n = 1 mod 4. A single remaining limb goes straight to L(ast);
C otherwise peel two limbs, rp[n-1] and rp[n-2], leaving n = 3 mod 4.
97 cmp $1, n
98 je L(ast)
99 movq -8(up,n,8), %mm2
100 psllq %mm4, %mm2
101 movq -16(up,n,8), %mm3
102 psllq %mm4, %mm3
103 movq -16(up,n,8), %mm0
104 movq -24(up,n,8), %mm1
105 pxor %mm6, %mm2
106 psrlq %mm5, %mm0
107 pandn %mm2, %mm0
108 pxor %mm6, %mm3
109 psrlq %mm5, %mm1
110 pandn %mm3, %mm1
111 movq %mm0, -8(rp,n,8)
112 movq %mm1, -16(rp,n,8)
113 sub $2, n
C Loop set-up: preload the left-shifted halves of the top two remaining
C limbs into mm2 and mm3, then bias n by 4 for the loop addressing.
115 L(rol): movq -8(up,n,8), %mm2
116 psllq %mm4, %mm2
117 movq -16(up,n,8), %mm3
118 psllq %mm4, %mm3
120 sub $4, n
121 jb L(end) C fewer than 4 limbs left: skip the loop
122 ALIGN(32)
123 L(top):
C Each iteration stores four limbs, rp[n+3] down to rp[n], and preloads the
C left-shifted halves of the next two limbs for the following iteration.
124 C finish stuff from lsh block
125 movq 16(up,n,8), %mm0
126 pxor %mm6, %mm2
127 movq 8(up,n,8), %mm1
128 psrlq %mm5, %mm0
129 psrlq %mm5, %mm1
130 pandn %mm2, %mm0
131 pxor %mm6, %mm3
132 movq %mm0, 24(rp,n,8)
133 movq (up,n,8), %mm0
134 pandn %mm3, %mm1
135 movq %mm1, 16(rp,n,8)
136 movq -8(up,n,8), %mm1
137 C start two new rsh
138 psrlq %mm5, %mm0
139 psrlq %mm5, %mm1
141 C finish stuff from rsh block
C In this half the right-shifted part is the one complemented, and pandn
C inverts the left-shifted part; the stored value has the same form.
142 movq 8(up,n,8), %mm2
143 pxor %mm6, %mm0
144 movq (up,n,8), %mm3
145 psllq %mm4, %mm2
146 psllq %mm4, %mm3
147 pandn %mm0, %mm2
148 pxor %mm6, %mm1
149 movq %mm2, 8(rp,n,8)
150 movq -8(up,n,8), %mm2
151 pandn %mm1, %mm3
152 movq %mm3, (rp,n,8)
153 movq -16(up,n,8), %mm3
154 C start two new lsh
155 sub $4, n
156 psllq %mm4, %mm2 C MMX shifts leave the flags from sub intact
157 psllq %mm4, %mm3
159 jae L(top) C loop while the sub above did not borrow
C Wind-down: mm2 and mm3 hold up[2] << cnt and up[1] << cnt; combine them
C with the limbs below to produce rp[2] and rp[1].
161 L(end): pxor %mm6, %mm2
162 movq 8(up), %mm0
163 psrlq %mm5, %mm0
164 pandn %mm2, %mm0
165 pxor %mm6, %mm3
166 movq (up), %mm1
167 psrlq %mm5, %mm1
168 pandn %mm3, %mm1
169 movq %mm0, 16(rp)
170 movq %mm1, 8(rp)
C Least significant limb: nothing is shifted in from below.
172 L(ast): movq (up), %mm2
173 psllq %mm4, %mm2
174 pxor %mm6, %mm2
175 movq %mm2, (rp)
176 emms C leave MMX state before returning to the caller
177 FUNC_EXIT()
178 ret C return; rax still holds the value set by the shr above
179 EPILOGUE()