dnl  AMD64 mpn_lshiftc optimised for CPUs with fast SSE including fast movdqu.

dnl  Contributed to the GNU project by Torbjorn Granlund.

dnl  Copyright 2010-2012 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')
C                 cycles/limb   cycles/limb   cycles/limb   good
C                   aligned      unaligned     best seen    for cpu?
C AMD K8,K9          3            3             ?           no, use shl/shr
C AMD K10            1.8-2.0      1.8-2.0       ?           yes
C AMD bd1            1.9          1.9           ?           yes
C AMD bobcat         3.67         3.67                      yes, bad for n < 20
C Intel P4           4.75         4.75          ?           no, slow movdqu
C Intel core2        2.27         2.27          ?           no, use shld/shrd
C Intel NHM          2.15         2.15          ?           no, use shld/shrd
C Intel SBR          1.45         1.45          ?           yes, bad for n = 4-6
C Intel atom        12.9         12.9           ?           no
C VIA nano           6.18         6.44          ?           no, slow movdqu
C We try to do as many aligned 16-byte operations as possible.  The top-most
C and bottom-most writes might need 8-byte operations.

C This variant relies on fast movdqu loads, and uses them even for aligned
C operands, in order to avoid the need for two separate loops.
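C
C As a hedged illustration (not used by the build): in each 16-byte step
C below, a pair of limbs loaded from ap is shifted left by cnt (psllq), the
C pair starting one limb lower is shifted right by 64-cnt (psrlq), the two are
C or-ed together (por), and the result is complemented by pxor against an
C all-ones register, i.e. two limbs of
C
C   rp[i] = ~((ap[i] << cnt) | (ap[i-1] >> (64 - cnt)))
C
C are produced per store, assuming 64-bit limbs and 1 <= cnt < 64.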
C TODO
C  * Could 2-limb wind-down code be simplified?
C  * Improve basecase code, using shld/shrd for SBR, discrete integer shifts
C    for other affected CPUs.
C INPUT PARAMETERS
define(`rp',  `%rdi')
define(`ap',  `%rsi')
define(`n',   `%rdx')
define(`cnt', `%rcx')
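C
C The defines above map the arguments to the System V AMD64 argument
C registers.  For reference, a hedged C-level sketch of the whole operation
C (not part of the build), assuming 64-bit limbs, no nails and 1 <= cnt < 64;
C mpn_lshiftc shifts left, complements the result, and returns the bits
C shifted out of the top limb:
C
C   mp_limb_t
C   mpn_lshiftc (mp_ptr rp, mp_srcptr ap, mp_size_t n, unsigned int cnt)
C   {
C     unsigned int tnc = 64 - cnt;
C     mp_limb_t retval = ap[n - 1] >> tnc;
C     for (mp_size_t i = n - 1; i > 0; i--)
C       rp[i] = ~((ap[i] << cnt) | (ap[i - 1] >> tnc));
C     rp[0] = ~(ap[0] << cnt);
C     return retval;
C   }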
ASM_START()
	TEXT
	ALIGN(64)
PROLOGUE(mpn_lshiftc)
	FUNC_ENTRY(4)
	movd	R32(%rcx), %xmm4
	mov	$64, R32(%rax)
	sub	R32(%rcx), R32(%rax)
	movd	R32(%rax), %xmm5
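C xmm4 = cnt and xmm5 = 64-cnt, the two shift counts used by the psllq/psrlq
C pairs below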
	neg	R32(%rcx)
	mov	-8(ap,n,8), %rax
	shr	R8(%rcx), %rax
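C %rax = ap[n-1] >> (64-cnt): the bits shifted out at the top, i.e. the value
C returned by the function (shr reduces the negated count mod 64)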
	pcmpeqb	%xmm3, %xmm3		C set to 111...111

	cmp	$3, n
	jle	L(bc)

	lea	(rp,n,8), R32(%rcx)
	test	$8, R8(%rcx)
	jz	L(rp_aligned)

C Do one initial limb in order to make rp aligned
	movq	-8(ap,n,8), %xmm0
	movq	-16(ap,n,8), %xmm1
	psllq	%xmm4, %xmm0
	psrlq	%xmm5, %xmm1
	por	%xmm1, %xmm0
	pxor	%xmm3, %xmm0
	movq	%xmm0, -8(rp,n,8)
	dec	n

L(rp_aligned):
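C Dispatch on n mod 8: the lea/and below picks one of four entry points into
C the 8-limb unrolled loop, and n is biased so that the loop exits with one or
C two limbs left over for the wind-down code at L(end)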
	lea	1(n), %r8d
	and	$6, R32(%r8)
	jz	L(ba0)
	cmp	$4, R32(%r8)
	jz	L(ba4)
	jc	L(ba2)
L(ba6):	add	$-4, n
	jmp	L(i56)
L(ba0):	add	$-6, n
	jmp	L(i70)
L(ba4):	add	$-2, n
	jmp	L(i34)
L(ba2):	add	$-8, n
	jle	L(end)
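C Main loop: eight limbs (four 16-byte stores) per iteration.  Loads use
C movdqu so ap may be unaligned; stores use movdqa, which is why rp was made
C 16-byte aligned above.  pxor with the all-ones xmm3 forms the complement.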
	ALIGN(16)
L(top):	movdqu	40(ap,n,8), %xmm1
	movdqu	48(ap,n,8), %xmm0
	psllq	%xmm4, %xmm0
	psrlq	%xmm5, %xmm1
	por	%xmm1, %xmm0
	pxor	%xmm3, %xmm0
	movdqa	%xmm0, 48(rp,n,8)
L(i70):
	movdqu	24(ap,n,8), %xmm1
	movdqu	32(ap,n,8), %xmm0
	psllq	%xmm4, %xmm0
	psrlq	%xmm5, %xmm1
	por	%xmm1, %xmm0
	pxor	%xmm3, %xmm0
	movdqa	%xmm0, 32(rp,n,8)
L(i56):
	movdqu	8(ap,n,8), %xmm1
	movdqu	16(ap,n,8), %xmm0
	psllq	%xmm4, %xmm0
	psrlq	%xmm5, %xmm1
	por	%xmm1, %xmm0
	pxor	%xmm3, %xmm0
	movdqa	%xmm0, 16(rp,n,8)
L(i34):
	movdqu	-8(ap,n,8), %xmm1
	movdqu	(ap,n,8), %xmm0
	psllq	%xmm4, %xmm0
	psrlq	%xmm5, %xmm1
	por	%xmm1, %xmm0
	pxor	%xmm3, %xmm0
	movdqa	%xmm0, (rp,n,8)
	sub	$8, n
	jg	L(top)
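C Wind-down: at this point one or two limbs remain.  If n is odd here only the
C bottom limb is left and is handled at L(end8); otherwise two limbs are
C produced with a single 16-byte store below.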
L(end):	test	$1, R8(n)
	jnz	L(end8)

	movdqu	(ap), %xmm1
	pxor	%xmm0, %xmm0
	punpcklqdq %xmm1, %xmm0
	psllq	%xmm4, %xmm1
	psrlq	%xmm5, %xmm0
	por	%xmm1, %xmm0
	pxor	%xmm3, %xmm0
	movdqa	%xmm0, (rp)
	FUNC_EXIT()
	ret
C Basecase
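C The basecase handles n <= 3: limbs are produced one at a time with 64-bit
C movq operations, top limb first; L(end8) finally writes
C rp[0] = ~(ap[0] << cnt).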
	ALIGN(16)
L(bc):	dec	R32(n)
	jz	L(end8)

	movq	(ap,n,8), %xmm1
	movq	-8(ap,n,8), %xmm0
	psllq	%xmm4, %xmm1
	psrlq	%xmm5, %xmm0
	por	%xmm1, %xmm0
	pxor	%xmm3, %xmm0
	movq	%xmm0, (rp,n,8)
	sub	$2, R32(n)
	jl	L(end8)
	movq	8(ap), %xmm1
	movq	(ap), %xmm0
	psllq	%xmm4, %xmm1
	psrlq	%xmm5, %xmm0
	por	%xmm1, %xmm0
	pxor	%xmm3, %xmm0
	movq	%xmm0, 8(rp)
L(end8):movq	(ap), %xmm0
	psllq	%xmm4, %xmm0
	pxor	%xmm3, %xmm0
	movq	%xmm0, (rp)
	FUNC_EXIT()
	ret
EPILOGUE()