beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / x86_64 / pentium4 / rshift.asm
blob b7c1ee2cdd49aa80bc90f10a556aec59313bce4d
1 dnl x86-64 mpn_rshift optimized for Pentium 4.
3 dnl Copyright 2003, 2005, 2007, 2008, 2012 Free Software Foundation, Inc.
5 dnl This file is part of the GNU MP Library.
6 dnl
7 dnl The GNU MP Library is free software; you can redistribute it and/or modify
8 dnl it under the terms of either:
9 dnl
10 dnl * the GNU Lesser General Public License as published by the Free
11 dnl Software Foundation; either version 3 of the License, or (at your
12 dnl option) any later version.
13 dnl
14 dnl or
15 dnl
16 dnl * the GNU General Public License as published by the Free Software
17 dnl Foundation; either version 2 of the License, or (at your option) any
18 dnl later version.
19 dnl
20 dnl or both in parallel, as here.
21 dnl
22 dnl The GNU MP Library is distributed in the hope that it will be useful, but
23 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
25 dnl for more details.
26 dnl
27 dnl You should have received copies of the GNU General Public License and the
28 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
29 dnl see https://www.gnu.org/licenses/.
31 include(`../config.m4')
34 C cycles/limb
35 C AMD K8,K9 2.5
36 C AMD K10 ?
37 C Intel P4 3.29
38 C Intel core2 2.1 (fluctuates, presumably cache related)
39 C Intel corei ?
40 C Intel atom 14.3
41 C VIA nano ?
C INPUT PARAMETERS
C   rp   destination limb vector (written)
C   up   source limb vector (read)
C   n    number of 64-bit limbs to process
C   cnt  right-shift count in bits
define(`rp',`%rdi')
define(`up',`%rsi')
define(`n',`%rdx')
define(`cnt',`%cl')

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

C ---------------------------------------------------------------------------
C mpn_rshift -- shift the n-limb vector at up right by cnt bits, store the
C result at rp, and return the bits shifted out of the lowest limb.
C
C Out:    %rax = up[0] << ((-cnt) & 63), i.e. the shifted-out bits placed in
C         the most significant end of the return limb.
C Uses:   %rax, %rcx, %r8, %mm0-%mm5, flags.  MMX state is cleared with emms
C         before returning.
C Assumes n >= 1 and 1 <= cnt <= 63, as per the documented mpn_rshift
C contract; neither is checked here.
C
C Register roles after the setup code:
C   %mm4  right-shift count (cnt)
C   %mm5  complementary left-shift count, (-cnt) & 63
C   up,rp point at the LAST limb of source / destination
C   n     negated limb count; 8(up,n,8) addresses the lowest limb not yet
C         processed and n counts upward as limbs are consumed
C   %r8d  (n + 1) & 3, selects how many limbs are peeled off so that the
C         remaining count suits the 4-way unrolled loop
C
C FUNC_ENTRY/FUNC_EXIT are defined outside this file (presumably they remap
C and restore registers for the DOS64 ABI); they do not return, so an
C explicit ret is required after FUNC_EXIT.
C ---------------------------------------------------------------------------

ASM_START()
	TEXT
	ALIGN(32)
PROLOGUE(mpn_rshift)
	FUNC_ENTRY(4)
	mov	(up), %rax		C lowest limb, source of the return value
	movd	R32(%rcx), %mm4		C mm4 = right-shift count
	neg	R32(%rcx)		C put lsh count in cl
	and	$63, R32(%rcx)
	movd	R32(%rcx), %mm5		C mm5 = left-shift count

	lea	-8(up,n,8), up		C up -> last source limb
	lea	-8(rp,n,8), rp		C rp -> last destination limb
	lea	1(n), R32(%r8)
	neg	n			C index from the top, counting up

	shl	R8(%rcx), %rax		C function return value

	and	$3, R32(%r8)
	je	L(rol)			C jump for n = 3, 7, 11, ...

	dec	R32(%r8)
	jne	L(1)
C	n = 4, 8, 12, ...
C	Peel one limb so that 3 mod 4 limbs remain.
	movq	8(up,n,8), %mm2
	psrlq	%mm4, %mm2
	movq	16(up,n,8), %mm0
	psllq	%mm5, %mm0
	por	%mm0, %mm2
	movq	%mm2, 8(rp,n,8)
	inc	n
	jmp	L(rol)

L(1):	dec	R32(%r8)
	je	L(1x)			C jump for n = 1, 5, 9, 13, ...
C	n = 2, 6, 10, 14, ...
C	Peel one limb, then fall into the n = 1 mod 4 handling.
	movq	8(up,n,8), %mm2
	psrlq	%mm4, %mm2
	movq	16(up,n,8), %mm0
	psllq	%mm5, %mm0
	por	%mm0, %mm2
	movq	%mm2, 8(rp,n,8)
	inc	n
L(1x):
	cmp	$-1, n
	je	L(ast)			C only the top limb is left
C	Peel two limbs so that 3 mod 4 limbs remain.
	movq	8(up,n,8), %mm2
	psrlq	%mm4, %mm2
	movq	16(up,n,8), %mm3
	psrlq	%mm4, %mm3
	movq	16(up,n,8), %mm0
	movq	24(up,n,8), %mm1
	psllq	%mm5, %mm0
	por	%mm0, %mm2
	psllq	%mm5, %mm1
	por	%mm1, %mm3
	movq	%mm2, 8(rp,n,8)
	movq	%mm3, 16(rp,n,8)
	add	$2, n

C	Prime the pipeline: mm2/mm3 hold the right-shifted parts of the next
C	two limbs; their high bits are merged in by L(top) or L(end).
L(rol):	movq	8(up,n,8), %mm2
	psrlq	%mm4, %mm2
	movq	16(up,n,8), %mm3
	psrlq	%mm4, %mm3

	add	$4, n			C				      4
	jb	L(end)			C				      2
	ALIGN(32)
L(top):
	C complete the two pending results in mm2/mm3 with the low bits of
	C their upper neighbours, and fetch the next two upper neighbours
	movq	-16(up,n,8), %mm0
	movq	-8(up,n,8), %mm1
	psllq	%mm5, %mm0
	por	%mm0, %mm2
	psllq	%mm5, %mm1
	movq	(up,n,8), %mm0
	por	%mm1, %mm3
	movq	8(up,n,8), %mm1
	movq	%mm2, -24(rp,n,8)
	movq	%mm3, -16(rp,n,8)
	C left-shifted halves of the following two results
	psllq	%mm5, %mm0
	psllq	%mm5, %mm1

	C merge in the matching right-shifted halves and store
	movq	-8(up,n,8), %mm2
	movq	(up,n,8), %mm3
	psrlq	%mm4, %mm2
	por	%mm2, %mm0
	psrlq	%mm4, %mm3
	movq	8(up,n,8), %mm2
	por	%mm3, %mm1
	movq	16(up,n,8), %mm3
	movq	%mm0, -8(rp,n,8)
	movq	%mm1, (rp,n,8)
	C start the right-shifted halves for the next iteration
	add	$4, n
	psrlq	%mm4, %mm2
	psrlq	%mm4, %mm3

	jae	L(top)			C				      2
L(end):
	C finish the two pending results just below the top limb
	movq	-8(up), %mm0
	psllq	%mm5, %mm0
	por	%mm0, %mm2
	movq	(up), %mm1
	psllq	%mm5, %mm1
	por	%mm1, %mm3
	movq	%mm2, -16(rp)
	movq	%mm3, -8(rp)

C	Top limb: nothing above it to merge in, so its high bits become zero.
L(ast):	movq	(up), %mm2
	psrlq	%mm4, %mm2
	movq	%mm2, (rp)
	emms				C leave MMX state clean for the caller
	FUNC_EXIT()
	ret				C was missing: return to the caller
EPILOGUE()