dnl  beta-0.89.2
dnl  [luatex.git] / source / libs / gmp / gmp-src / mpn / x86_64 / fastsse / com.asm
dnl  blob 307fb75373d0de5f80990aa05dbe3277c88ddeb8
dnl  AMD64 mpn_com optimised for CPUs with fast SSE.

dnl  Copyright 2003, 2005, 2007, 2011, 2012, 2015 Free Software Foundation,
dnl  Inc.

dnl  Contributed to the GNU project by Torbjorn Granlund.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.
dnl  Pull in the build-configured m4 macro set (PROLOGUE, FUNC_ENTRY, R8, ...).
include(`../config.m4')
C	        cycles/limb  cycles/limb  cycles/limb   good
C	        aligned      unaligned    best seen     for cpu?
C AMD K8,K9	2.0	     2.0			N
C AMD K10	0.85	     1.3			Y/N
C AMD bull	1.40	     1.40			Y
C AMD pile	0.9-1.4	     0.9-1.4			Y
C AMD steam
C AMD excavator
C AMD bobcat	3.1	     3.1			N
C AMD jaguar	0.91	     0.91     opt/opt		Y
C Intel P4	2.28	     illop			Y
C Intel core2	1.02	     1.02			N
C Intel NHM	0.53	     0.68			Y
C Intel SBR	0.51	     0.75     opt/0.65		Y/N
C Intel IBR	0.50	     0.57     opt/opt		Y
C Intel HWL	0.51	     0.64     opt/0.58		Y
C Intel BWL	0.61	     0.65     0.57/opt		Y
C Intel atom	3.68	     3.68			N
C Intel SLM	1.09	     1.35			N
C VIA nano	1.17	     5.09			Y/N
C We try to do as many 16-byte operations as possible.  The top-most and
C bottom-most writes might need 8-byte operations.  We can always write using
C aligned 16-byte operations, we read with both aligned and unaligned 16-byte
C operations.

C Instead of having separate loops for reading aligned and unaligned, we read
C using MOVDQU.  This seems to work great except for core2; there performance
C doubles when reading using MOVDQA (for aligned source).  It is unclear how to
C best handle the unaligned case there.
C INPUT PARAMETERS
C (SysV AMD64 register names; on DOS64 entry FUNC_ENTRY remaps the
C Windows argument registers into these.)
define(`rp', `%rdi')		C destination limb pointer (arg 1)
define(`up', `%rsi')		C source limb pointer      (arg 2)
define(`n', `%rdx')		C limb count               (arg 3)

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
	TEXT
	ALIGN(16)
C void mpn_com (mp_ptr rp, mp_srcptr up, mp_size_t n)
C   In:  rp = %rdi, up = %rsi, n = %rdx
C   Complements n 64-bit limbs from up into rp.
C NOTE(review): there is no n == 0 guard; the standard mpn contract
C presumably requires n >= 1 here -- confirm against callers.
PROLOGUE(mpn_com)
	FUNC_ENTRY(3)
	pcmpeqb	%xmm7, %xmm7		C xmm7 = all ones; pxor with it == NOT

C If rp is only 8-byte aligned, complement one limb with a scalar op so
C every movdqa store below lands on a 16-byte boundary.
	test	$8, R8(rp)		C is rp 16-byte aligned?
	jz	L(ali)			C jump if rp aligned
	mov	(up), %rax
	lea	8(up), up
	not	%rax
	mov	%rax, (rp)
	lea	8(rp), rp
	dec	n

	sub	$14, n			C main loop does 14 limbs/iteration
	jc	L(sma)			C branch if fewer than 14 limbs left

	ALIGN(16)
L(top):	movdqu	(up), %xmm0		C unaligned reads (see comments above)
	movdqu	16(up), %xmm1
	movdqu	32(up), %xmm2
	movdqu	48(up), %xmm3
	movdqu	64(up), %xmm4
	movdqu	80(up), %xmm5
	movdqu	96(up), %xmm6
	lea	112(up), up
	pxor	%xmm7, %xmm0		C complement 7 x 2 limbs
	pxor	%xmm7, %xmm1
	pxor	%xmm7, %xmm2
	pxor	%xmm7, %xmm3
	pxor	%xmm7, %xmm4
	pxor	%xmm7, %xmm5
	pxor	%xmm7, %xmm6
	movdqa	%xmm0, (rp)		C aligned writes (rp aligned above)
	movdqa	%xmm1, 16(rp)
	movdqa	%xmm2, 32(rp)
	movdqa	%xmm3, 48(rp)
	movdqa	%xmm4, 64(rp)
	movdqa	%xmm5, 80(rp)
	movdqa	%xmm6, 96(rp)
	lea	112(rp), rp
L(ali):	sub	$14, n
	jnc	L(top)

C 0..13 limbs remain; bits 3, 2, 1, 0 of n select the pending 8-, 4-,
C 2- and 1-limb tail chunks respectively.  (The 1: local labels and the
C final ret were missing from the scraped copy and are restored here.)
L(sma):	add	$14, n			C restore remaining limb count
	test	$8, R8(n)
	jz	1f
	movdqu	(up), %xmm0
	movdqu	16(up), %xmm1
	movdqu	32(up), %xmm2
	movdqu	48(up), %xmm3
	lea	64(up), up
	pxor	%xmm7, %xmm0
	pxor	%xmm7, %xmm1
	pxor	%xmm7, %xmm2
	pxor	%xmm7, %xmm3
	movdqa	%xmm0, (rp)
	movdqa	%xmm1, 16(rp)
	movdqa	%xmm2, 32(rp)
	movdqa	%xmm3, 48(rp)
	lea	64(rp), rp

1:	test	$4, R8(n)
	jz	1f
	movdqu	(up), %xmm0
	movdqu	16(up), %xmm1
	lea	32(up), up
	pxor	%xmm7, %xmm0
	pxor	%xmm7, %xmm1
	movdqa	%xmm0, (rp)
	movdqa	%xmm1, 16(rp)
	lea	32(rp), rp

1:	test	$2, R8(n)
	jz	1f
	movdqu	(up), %xmm0
	lea	16(up), up
	pxor	%xmm7, %xmm0
	movdqa	%xmm0, (rp)
	lea	16(rp), rp

1:	test	$1, R8(n)
	jz	1f
	mov	(up), %rax		C final odd limb, scalar 8-byte op
	not	%rax
	mov	%rax, (rp)

1:
L(don):	FUNC_EXIT()
	ret
EPILOGUE()