dnl  AMD64 mpn_lshift optimised for CPUs with fast SSE.

dnl  Contributed to the GNU project by David Harvey and Torbjorn Granlund.

dnl  Copyright 2010-2012 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C                     cycles/limb          cycles/limb         good
C                  16-byte aligned      16-byte unaligned    for cpu?
C AMD K8,K9            ?                    ?
C AMD K10              1.68  (1.45)         1.75  (1.49)        Y
C AMD bd1              1.82  (1.75)         1.82  (1.75)        Y
C AMD bobcat           4                    4
C Intel P4             3     (2.7)          3     (2.7)         Y
C Intel core2          2.05  (1.67)         2.55  (1.75)
C Intel NHM            2.05  (1.75)         2.09  (2)
C Intel SBR            1.5   (1.3125)       1.5   (1.4375)      Y
C Intel atom           ?                    ?
C VIA nano             2.25  (2)            2.5   (2)           Y

C We try to do as many 16-byte operations as possible.  The top-most and
C bottom-most writes might need 8-byte operations.

C There are two inner-loops, one for when rp = ap (mod 16) and one when this is
C not true.  The aligned case reads 16+8 bytes, the unaligned case reads
C 16+8+X bytes, where X is 8 or 16 depending on how punpcklqdq is implemented.

C This is not yet great code:
C   (1) The unaligned case makes many reads.
C   (2) We should do some unrolling, at least 2-way.
C With 2-way unrolling but no scheduling we reach 1.5 c/l on K10 and 2 c/l on
C Nano.
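
C For reference, a plain C sketch of the limb recurrence this routine
C implements (standard mpn_lshift semantics: shift {ap,n} left by cnt bits,
C 1 <= cnt <= 63, 64-bit limbs, working from the top limb downwards; the
C function name and loop below are illustrative, not part of this file):
C
C   mp_limb_t
C   lshift_ref (mp_ptr rp, mp_srcptr ap, mp_size_t n, unsigned cnt)
C   {
C     mp_limb_t retval = ap[n - 1] >> (64 - cnt);    /* bits shifted out */
C     for (mp_size_t i = n - 1; i > 0; i--)
C       rp[i] = (ap[i] << cnt) | (ap[i - 1] >> (64 - cnt));
C     rp[0] = ap[0] << cnt;
C     return retval;
C   }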

C INPUT PARAMETERS
define(`rp',  `%rdi')
define(`ap',  `%rsi')
define(`n',   `%rdx')
define(`cnt', `%rcx')
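
C In the x86-64 SysV calling convention these are the four argument registers
C of mp_limb_t mpn_lshift (mp_ptr rp, mp_srcptr ap, mp_size_t n, unsigned cnt);
C the return value, i.e. the cnt bits shifted out of the top limb, is left in
C %rax.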

ASM_START()
        TEXT
        ALIGN(64)
PROLOGUE(mpn_lshift)
        movd    R32(%rcx), %xmm4
        mov     $64, R32(%rax)
        sub     R32(%rcx), R32(%rax)
        movd    R32(%rax), %xmm5

        neg     R32(%rcx)
        mov     -8(ap,n,8), %rax
        shr     R8(%rcx), %rax
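
C At this point xmm4 holds cnt, xmm5 holds 64-cnt, and %rax already holds the
C return value (the high cnt bits of ap[n-1], obtained by shifting right by
C 64-cnt, i.e. by the low six bits of -cnt).  Computing it before any store
C keeps it correct when rp and ap overlap.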

        cmp     $2, n
        jle     L(le2)

        lea     (rp,n,8), R32(%rcx)
        test    $8, R8(%rcx)
        je      L(rp_aligned)

C Do one initial limb in order to make rp aligned
        movq    -8(ap,n,8), %xmm0
        movq    -16(ap,n,8), %xmm1
        psllq   %xmm4, %xmm0
        psrlq   %xmm5, %xmm1
        por     %xmm1, %xmm0
        movq    %xmm0, -8(rp,n,8)
        dec     n
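
C After storing the top limb rp[n-1] and decrementing n, rp+8n is 16-byte
C aligned, so both loops below can use aligned 16-byte stores.  L(rp_aligned)
C then tests whether ap+8n has the same 16-byte alignment and enters the
C matching loop: L(aent) when it does, L(uent) when it is offset by 8 bytes.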

L(rp_aligned):
        lea     (ap,n,8), R32(%rcx)
        test    $8, R8(%rcx)
        je      L(aent)
        jmp     L(uent)
C *****************************************************************************

C Handle the case when ap != rp (mod 16).

        ALIGN(16)
L(utop):movdqa  -8(ap,n,8), %xmm0
        movq    (ap,n,8), %xmm1
        punpcklqdq  8(ap,n,8), %xmm1
        psllq   %xmm4, %xmm1
        psrlq   %xmm5, %xmm0
        por     %xmm1, %xmm0
        movdqa  %xmm0, (rp,n,8)
L(uent):sub     $2, n
        ja      L(utop)

        jne     L(end8)
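
C Exit logic for both loops: sub $2, n leaves n > 0 (keep looping), n = 0
C (two final limbs remain, handled by the 16-byte store just below), or
C n < 0 (one final limb remains, handled at L(end8)).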

        movq    (ap), %xmm1
        pxor    %xmm0, %xmm0
        punpcklqdq  %xmm1, %xmm0
        punpcklqdq  8(ap), %xmm1
        psllq   %xmm4, %xmm1
        psrlq   %xmm5, %xmm0
        por     %xmm1, %xmm0
        movdqa  %xmm0, (rp)
        ret
C *****************************************************************************

C Handle the case when ap = rp (mod 16).

        ALIGN(16)
L(atop):movdqa  (ap,n,8), %xmm0         C xmm0 = B*ap[n-1] + ap[n-2]
        movq    -8(ap,n,8), %xmm1       C xmm1 = ap[n-3]
        punpcklqdq  %xmm0, %xmm1        C xmm1 = B*ap[n-2] + ap[n-3]
        psllq   %xmm4, %xmm0
        psrlq   %xmm5, %xmm1
        por     %xmm1, %xmm0
        movdqa  %xmm0, (rp,n,8)
L(aent):
        sub     $2, n
        ja      L(atop)
        jne     L(end8)
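
C Same three-way exit as in the unaligned loop above.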

        movdqa  (ap), %xmm1
        pxor    %xmm0, %xmm0
        punpcklqdq  %xmm1, %xmm0
        psllq   %xmm4, %xmm1
        psrlq   %xmm5, %xmm0
        por     %xmm1, %xmm0
        movdqa  %xmm0, (rp)
        ret
C *****************************************************************************

        ALIGN(16)
L(le2): jne     L(end8)
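
C For n <= 2 the flags still come from the cmp $2, n above: n = 1 goes
C straight to L(end8), n = 2 stores the top limb here and then falls through
C for the bottom limb.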

        movq    8(ap), %xmm0
        movq    (ap), %xmm1
        psllq   %xmm4, %xmm0
        psrlq   %xmm5, %xmm1
        por     %xmm1, %xmm0
        movq    %xmm0, 8(rp)
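
C Store the remaining bottom limb, rp[0] = ap[0] << cnt; the return value is
C still in %rax.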
L(end8):movq    (ap), %xmm0
        psllq   %xmm4, %xmm0
        movq    %xmm0, (rp)
        ret
EPILOGUE()