beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / x86_64 / pentium4 / lshift.asm
blobd3b521364fffc9ca541f794bd2187c96fc28a208
1 dnl x86-64 mpn_lshift optimized for Pentium 4.
3 dnl Copyright 2003, 2005, 2007, 2008, 2012 Free Software Foundation, Inc.
5 dnl This file is part of the GNU MP Library.
6 dnl
7 dnl The GNU MP Library is free software; you can redistribute it and/or modify
8 dnl it under the terms of either:
9 dnl
10 dnl * the GNU Lesser General Public License as published by the Free
11 dnl Software Foundation; either version 3 of the License, or (at your
12 dnl option) any later version.
13 dnl
14 dnl or
15 dnl
16 dnl * the GNU General Public License as published by the Free Software
17 dnl Foundation; either version 2 of the License, or (at your option) any
18 dnl later version.
19 dnl
20 dnl or both in parallel, as here.
21 dnl
22 dnl The GNU MP Library is distributed in the hope that it will be useful, but
23 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
25 dnl for more details.
26 dnl
27 dnl You should have received copies of the GNU General Public License and the
28 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
29 dnl see https://www.gnu.org/licenses/.
31 include(`../config.m4')
34 C cycles/limb
35 C AMD K8,K9 2.5
36 C AMD K10 ?
37 C Intel P4 3.29
38 C Intel core2 2.1 (fluctuates, presumably cache related)
39 C Intel corei ?
40 C Intel atom 14.3
41 C VIA nano ?
C INPUT PARAMETERS
C Register names used by the code below.  The roles listed are the ones
C the instructions in this file give them.
define(`rp',  `%rdi')	C base address for every result store
define(`up',  `%rsi')	C base address for every source load
define(`n',   `%rdx')	C limb count; doubles as the descending limb index
define(`cnt', `%cl')	C shift count; the code reads it through R32(%rcx)

C ABI declarations consumed by the included m4 definitions.
C NOTE(review): presumably these list the calling conventions this file
C can be built for -- confirm against config.m4.
ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
	TEXT
	ALIGN(32)
C mpn_lshift -- shift the n-limb operand at up left by cnt bits, store the
C n result limbs at rp, and return the bits shifted out of the top limb.
C
C In:    rp, up, n, cnt as named in INPUT PARAMETERS (cnt arrives in %rcx)
C Out:   %rax = up[n-1] >> rsh, where rsh = (-cnt) & 63
C Uses:  %rcx, %rdx, %r8, %mm0-%mm5, flags.  emms is issued before
C        returning, so the MMX state is left empty.
C
C Limbs are produced from the most significant end downwards:
C     rp[i] = (up[i] << cnt) | (up[i-1] >> rsh)     for i = n-1 ... 1
C     rp[0] =  up[0] << cnt
C
C NOTE(review): rsh equals 64 - cnt only for 1 <= cnt <= 63, and up[n-1]
C is read unconditionally, so callers are assumed to pass n >= 1 and a
C non-zero count -- confirm against the documented mpn_lshift contract.
C
C Structure: a short preamble handles 0, 1 or 2 leading limbs so that the
C number of limbs still to do is congruent to 3 mod 4 (or exactly 1), then
C a 4-way unrolled loop runs, followed by a fixed tail for limbs 2, 1, 0.
PROLOGUE(mpn_lshift)
	FUNC_ENTRY(4)			C ABI entry glue from the included defs
	mov	-8(up,n,8), %rax	C rax = up[n-1], source of return value
	movd	R32(%rcx), %mm4		C mm4 = left shift count
	neg	R32(%rcx)		C put rsh count in cl
	and	$63, R32(%rcx)
	movd	R32(%rcx), %mm5		C mm5 = right shift count (rsh)

	lea	1(n), R32(%r8)		C r8d = n + 1, used to classify n mod 4

	shr	R8(%rcx), %rax		C function return value

	and	$3, R32(%r8)
	je	L(rol)			C jump for n = 3, 7, 11, ...

	dec	R32(%r8)
	jne	L(1)
C	n = 4, 8, 12, ...
C	Peel one limb so the remaining count is 3 mod 4.
	movq	-8(up,n,8), %mm2
	psllq	%mm4, %mm2
	movq	-16(up,n,8), %mm0
	psrlq	%mm5, %mm0
	por	%mm0, %mm2
	movq	%mm2, -8(rp,n,8)
	dec	n
	jmp	L(rol)

L(1):	dec	R32(%r8)
	je	L(1x)			C jump for n = 1, 5, 9, 13, ...
C	n = 2, 6, 10, 14, ...
C	Peel one limb, leaving a count that is 1 mod 4.
	movq	-8(up,n,8), %mm2
	psllq	%mm4, %mm2
	movq	-16(up,n,8), %mm0
	psrlq	%mm5, %mm0
	por	%mm0, %mm2
	movq	%mm2, -8(rp,n,8)
	dec	n
L(1x):
	cmp	$1, n
	je	L(ast)			C only limb 0 is left
C	Peel two limbs, leaving a count that is 3 mod 4.
	movq	-8(up,n,8), %mm2
	psllq	%mm4, %mm2
	movq	-16(up,n,8), %mm3
	psllq	%mm4, %mm3
	movq	-16(up,n,8), %mm0
	movq	-24(up,n,8), %mm1
	psrlq	%mm5, %mm0
	por	%mm0, %mm2
	psrlq	%mm5, %mm1
	por	%mm1, %mm3
	movq	%mm2, -8(rp,n,8)
	movq	%mm3, -16(rp,n,8)
	sub	$2, n

C	Here n is 3 mod 4.  Start the left shifts for the top two pending
C	limbs; their low bits are merged in at L(top) or L(end).
L(rol):	movq	-8(up,n,8), %mm2
	psllq	%mm4, %mm2
	movq	-16(up,n,8), %mm3
	psllq	%mm4, %mm3

	sub	$4, n
	jb	L(end)			C fewer than 4 limbs left: tail only
	ALIGN(32)
L(top):
	C finish stuff from lsh block
	C (n was already lowered by 4, so these are the old up[n-2], up[n-3])
	movq	16(up,n,8), %mm0
	movq	8(up,n,8), %mm1
	psrlq	%mm5, %mm0
	por	%mm0, %mm2
	psrlq	%mm5, %mm1
	movq	(up,n,8), %mm0
	por	%mm1, %mm3
	movq	-8(up,n,8), %mm1
	movq	%mm2, 24(rp,n,8)
	movq	%mm3, 16(rp,n,8)
	C start two new rsh
	psrlq	%mm5, %mm0
	psrlq	%mm5, %mm1

	C finish stuff from rsh block
	movq	8(up,n,8), %mm2
	movq	(up,n,8), %mm3
	psllq	%mm4, %mm2
	por	%mm2, %mm0
	psllq	%mm4, %mm3
	movq	-8(up,n,8), %mm2
	por	%mm3, %mm1
	movq	-16(up,n,8), %mm3
	movq	%mm0, 8(rp,n,8)
	movq	%mm1, (rp,n,8)
	C start two new lsh
	sub	$4, n
	psllq	%mm4, %mm2		C MMX shifts leave the flags of sub
	psllq	%mm4, %mm3		C untouched for the jae below

	jae	L(top)
L(end):
C	mm2/mm3 hold up[2] << cnt and up[1] << cnt; merge in the low bits.
	movq	8(up), %mm0
	psrlq	%mm5, %mm0
	por	%mm0, %mm2
	movq	(up), %mm1
	psrlq	%mm5, %mm1
	por	%mm1, %mm3
	movq	%mm2, 16(rp)
	movq	%mm3, 8(rp)

C	Lowest limb: nothing is shifted in from below.
L(ast):	movq	(up), %mm2
	psllq	%mm4, %mm2
	movq	%mm2, (rp)
	emms				C leave MMX state empty for the caller
	FUNC_EXIT()
	ret				C restored: without it control falls
					C off the end of the function
EPILOGUE()