beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / x86_64 / fastsse / lshiftc.asm
blobd2520690e223e4570520b962c0d76822c72226d8
1 dnl AMD64 mpn_lshiftc optimised for CPUs with fast SSE.
3 dnl Contributed to the GNU project by David Harvey and Torbjorn Granlund.
5 dnl Copyright 2010-2012 Free Software Foundation, Inc.
7 dnl This file is part of the GNU MP Library.
8 dnl
9 dnl The GNU MP Library is free software; you can redistribute it and/or modify
10 dnl it under the terms of either:
11 dnl
12 dnl * the GNU Lesser General Public License as published by the Free
13 dnl Software Foundation; either version 3 of the License, or (at your
14 dnl option) any later version.
15 dnl
16 dnl or
17 dnl
18 dnl * the GNU General Public License as published by the Free Software
19 dnl Foundation; either version 2 of the License, or (at your option) any
20 dnl later version.
21 dnl
22 dnl or both in parallel, as here.
23 dnl
24 dnl The GNU MP Library is distributed in the hope that it will be useful, but
25 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
26 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
27 dnl for more details.
28 dnl
29 dnl You should have received copies of the GNU General Public License and the
30 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
31 dnl see https://www.gnu.org/licenses/.
33 include(`../config.m4')
36 C cycles/limb cycles/limb good
37 C 16-byte aligned 16-byte unaligned for cpu?
38 C AMD K8,K9 ? ?
39 C AMD K10 1.85 (1.635) 1.9 (1.67) Y
40 C AMD bd1 1.82 (1.75) 1.82 (1.75) Y
41 C AMD bobcat 4.5 4.5
42 C Intel P4 3.6 (3.125) 3.6 (3.125) Y
43 C Intel core2 2.05 (1.67) 2.55 (1.75)
44 C Intel NHM 2.05 (1.875) 2.6 (2.25)
45 C Intel SBR 1.55 (1.44) 2 (1.57) Y
46 C Intel atom ? ?
47 C VIA nano 2.5 (2.5) 2.5 (2.5) Y
49 C We try to do as many 16-byte operations as possible. The top-most and
50 C bottom-most writes might need 8-byte operations. We always write using
51 C 16-byte operations, we read with both 8-byte and 16-byte operations.
53 C There are two inner-loops, one for when rp = ap (mod 16) and one when this is
54 C not true. The aligned case reads 16+8 bytes, the unaligned case reads
55 C 16+8+X bytes, where X is 8 or 16 depending on how punpcklqdq is implemented.
57 C This is not yet great code:
58 C (1) The unaligned case makes too many reads.
59 C (2) We should do some unrolling, at least 2-way.
60 C With 2-way unrolling but no scheduling we reach 1.5 c/l on K10 and 2 c/l on
61 C Nano.
C INPUT PARAMETERS
C These are the first four integer argument registers of the System V AMD64
C calling convention.
define(`rp',  `%rdi')		C destination limb pointer
define(`ap',  `%rsi')		C source limb pointer
define(`n',   `%rdx')		C number of limbs
define(`cnt', `%rcx')		C shift count in bits

C mpn_lshiftc
C
C Shift the n-limb operand at ap left by cnt bits, complement every result
C bit, and store the n result limbs at rp.  The value left in %rax is the top
C source limb shifted right by 64-cnt, that is the bits shifted out at the
C top.  It is not complemented.
C
C Assumptions made by the code below, to be confirmed against the callers:
C   * n >= 1, since the final limb at index 0 is always processed
C   * 0 < cnt < 64
C   * rp and ap are 8-byte aligned, so that after at most one single-limb
C     step the 16-byte movdqa accesses are properly aligned
C
C Register roles:
C   %xmm4   cnt, the left shift count for psllq
C   %xmm5   64-cnt, the right shift count for psrlq
C   %xmm7   all ones, xor mask that performs the complement
C   %xmm0, %xmm1   working registers
C   %rcx    scratch once the shift counts have been copied to %xmm4 and %xmm5
C   %rax    return value, computed up front and never touched again

ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(mpn_lshiftc)
	movd	R32(%rcx), %xmm4	C xmm4 = cnt
	mov	$64, R32(%rax)
	sub	R32(%rcx), R32(%rax)
	movd	R32(%rax), %xmm5	C xmm5 = 64-cnt

	neg	R32(%rcx)		C low 6 bits of -cnt equal 64-cnt
	mov	-8(ap,n,8), %rax	C top source limb
	shr	R8(%rcx), %rax		C return value = bits shifted out

	pcmpeqb	%xmm7, %xmm7		C set to 111...111

	cmp	$2, n
	jle	L(le2)			C 1 or 2 limbs, flags are reused at le2

	lea	(rp,n,8), R32(%rcx)	C only the low bits matter for the test
	test	$8, R8(%rcx)
	je	L(rp_aligned)

C Do one initial limb in order to make rp aligned
	movq	-8(ap,n,8), %xmm0
	movq	-16(ap,n,8), %xmm1
	psllq	%xmm4, %xmm0
	psrlq	%xmm5, %xmm1
	por	%xmm1, %xmm0
	pxor	%xmm7, %xmm0
	movq	%xmm0, -8(rp,n,8)
	dec	n

L(rp_aligned):
	lea	(ap,n,8), R32(%rcx)	C pick the loop matching ap alignment
	test	$8, R8(%rcx)
	je	L(aent)
	jmp	L(uent)

C *****************************************************************************

C Handle the case when ap != rp (mod 16).

	ALIGN(16)
L(utop):movq	(ap,n,8), %xmm1
	punpcklqdq  8(ap,n,8), %xmm1
	movdqa	-8(ap,n,8), %xmm0
	psllq	%xmm4, %xmm1
	psrlq	%xmm5, %xmm0
	por	%xmm1, %xmm0
	pxor	%xmm7, %xmm0
	movdqa	%xmm0, (rp,n,8)
L(uent):sub	$2, n
	ja	L(utop)			C more than two limbs were left

	jne	L(end8)			C exactly one limb was left

C Exactly two limbs were left.  Zero is shifted in at the bottom.
	movq	(ap), %xmm1
	pxor	%xmm0, %xmm0
	punpcklqdq  %xmm1, %xmm0
	punpcklqdq  8(ap), %xmm1
	psllq	%xmm4, %xmm1
	psrlq	%xmm5, %xmm0
	por	%xmm1, %xmm0
	pxor	%xmm7, %xmm0
	movdqa	%xmm0, (rp)
	ret				C done, must not fall into the aligned loop

C *****************************************************************************

C Handle the case when ap = rp (mod 16).

	ALIGN(16)
L(atop):movdqa	(ap,n,8), %xmm0		C xmm0 = B*ap[n-1] + ap[n-2]
	movq	-8(ap,n,8), %xmm1	C xmm1 = ap[n-3]
	punpcklqdq  %xmm0, %xmm1	C xmm1 = B*ap[n-2] + ap[n-3]
	psllq	%xmm4, %xmm0
	psrlq	%xmm5, %xmm1
	por	%xmm1, %xmm0
	pxor	%xmm7, %xmm0
	movdqa	%xmm0, (rp,n,8)
L(aent):sub	$2, n
	ja	L(atop)			C more than two limbs were left

	jne	L(end8)			C exactly one limb was left

C Exactly two limbs were left.  Zero is shifted in at the bottom.
	movdqa	(ap), %xmm0
	pxor	%xmm1, %xmm1
	punpcklqdq  %xmm0, %xmm1
	psllq	%xmm4, %xmm0
	psrlq	%xmm5, %xmm1
	por	%xmm1, %xmm0
	pxor	%xmm7, %xmm0
	movdqa	%xmm0, (rp)
	ret				C done, must not fall into the small cases

C *****************************************************************************

C Small operands.  The flags still hold the result of comparing n with 2.

	ALIGN(16)
L(le2):	jne	L(end8)			C n is below 2, handle a single limb

	movq	8(ap), %xmm0		C n = 2, produce the upper limb here
	movq	(ap), %xmm1
	psllq	%xmm4, %xmm0
	psrlq	%xmm5, %xmm1
	por	%xmm1, %xmm0
	pxor	%xmm7, %xmm0
	movq	%xmm0, 8(rp)

C Lowest limb.  Only zero bits are shifted in from below.
L(end8):movq	(ap), %xmm0
	psllq	%xmm4, %xmm0
	pxor	%xmm7, %xmm0
	movq	%xmm0, (rp)
	ret
EPILOGUE()