beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / x86_64 / fastavx / copyd.asm
blob56d472f83d2246d85704c2b5dade97487e10715a
dnl  AMD64 mpn_copyd optimised for CPUs with fast AVX.

dnl  Copyright 2003, 2005, 2007, 2011-2013, 2015 Free Software Foundation, Inc.

dnl  Contributed to the GNU project by Torbjörn Granlund.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.
dnl  Pull in the shared m4 configuration; the macros used further down
dnl  (PROLOGUE, FUNC_ENTRY, L, R8, ...) are not defined in this file.
include(`../config.m4')
C              cycles/limb    cycles/limb    cycles/limb     good
C                aligned       unaligned      best seen    for cpu?
C AMD K8,K9        n/a
C AMD K10          n/a
C AMD bull         n/a
C AMD pile         4.87           4.87                         N
C AMD steam         ?              ?
C AMD bobcat       n/a
C AMD jaguar       n/a
C Intel P4         n/a
C Intel core       n/a
C Intel NHM        n/a
C Intel SBR        0.50           0.91                         N
C Intel IBR        0.50           0.65                         N
C Intel HWL        0.25           0.30                         Y
C Intel BWL        0.28           0.37                         Y
C Intel atom       n/a
C VIA nano         n/a
C We try to do as many 32-byte operations as possible.  The top-most and
C bottom-most writes might need 8-byte operations.  For the bulk copying, we
C write using aligned 32-byte operations, but we read with both aligned and
C unaligned 32-byte operations.
C Symbolic names for the three argument registers used by the code below.
C rp is the pointer that gets stored through, up the pointer that gets
C loaded from, and n the count that is scaled by 8 (bytes per limb).
define(`rp', `%rdi')
define(`up', `%rsi')
define(`n',  `%rdx')

C ABI_SUPPORT is defined outside this file; the two lines below name the
C DOS64 and STD64 calling conventions.
ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

dnl  Disabled alternative: substitute vlddqu for every vmovdqu.
dnl define(`vmovdqu', vlddqu)
C mpn_copyd (rp, up, n) -- copy n 8-byte limbs from up to rp, starting at
C the highest address and working towards lower addresses.
C
C In:     rp = destination, up = source, n = limb count (after FUNC_ENTRY)
C Out:    no return value is set up; the limbs are stored at rp
C Clobb:  %rax, %rcx, %r8, %r9, %ymm0-%ymm3, flags; rp, up and n are consumed
C
C NOTE(review): FUNC_ENTRY, FUNC_EXIT and R8 are macros defined outside this
C file; R8(reg) is presumed to name the low byte of reg -- confirm.
C NOTE(review): the alignment tests assume rp is 8-byte aligned on entry.
C NOTE(review): ymm registers are written and no vzeroupper is issued before
C returning -- confirm this is intended for the callers of this routine.
ASM_START()
	TEXT
	ALIGN(32)
PROLOGUE(mpn_copyd)
	FUNC_ENTRY(3)

	lea	-32(rp,n,8), rp		C rp -> top four destination limbs
	lea	-32(up,n,8), up		C up -> top four source limbs

	cmp	$7, n			C basecase needed for correctness
	jbe	L(bc)			C n <= 7: scalar moves only

C Peel one limb, then two limbs, off the top so that rp becomes 32-byte
C aligned for the vmovdqa stores that follow.
	test	$8, R8(rp)		C is rp 16-byte aligned?
	jz	L(a2)			C jump if rp aligned
	mov	24(up), %rax		C move the topmost limb
	lea	-8(up), up
	mov	%rax, 24(rp)
	lea	-8(rp), rp
	dec	n
L(a2):	test	$16, R8(rp)		C is rp 32-byte aligned?
	jz	L(a3)			C jump if rp aligned
	vmovdqu	16(up), %xmm0		C move the two topmost limbs
	lea	-16(up), up
	vmovdqa	%xmm0, 16(rp)
	lea	-16(rp), rp
	sub	$2, n
L(a3):	sub	$16, n
	jc	L(sma)			C fewer than 16 limbs left, skip the loop

C Main loop: 16 limbs (four 32-byte vectors) per iteration, read with
C unaligned loads and written with aligned stores.
	ALIGN(16)
L(top):	vmovdqu	(up), %ymm0
	vmovdqu	-32(up), %ymm1
	vmovdqu	-64(up), %ymm2
	vmovdqu	-96(up), %ymm3
	lea	-128(up), up
	vmovdqa	%ymm0, (rp)
	vmovdqa	%ymm1, -32(rp)
	vmovdqa	%ymm2, -64(rp)
	vmovdqa	%ymm3, -96(rp)
	lea	-128(rp), rp
L(ali):	sub	$16, n			C L(ali) is not referenced in the code shown
	jnc	L(top)

C Tail: n went below zero by a multiple of 16, so its low four bits still
C hold the number of limbs left (0..15).  Handle them as 8, 4, 2 and 1.
L(sma):	test	$8, R8(n)
	jz	1f
	vmovdqu	(up), %ymm0
	vmovdqu	-32(up), %ymm1
	lea	-64(up), up
	vmovdqa	%ymm0, (rp)
	vmovdqa	%ymm1, -32(rp)
	lea	-64(rp), rp
1:
	test	$4, R8(n)
	jz	1f
	vmovdqu	(up), %ymm0
	lea	-32(up), up
	vmovdqa	%ymm0, (rp)
	lea	-32(rp), rp
1:
	test	$2, R8(n)
	jz	1f
	vmovdqu	16(up), %xmm0
	lea	-16(up), up
	vmovdqa	%xmm0, 16(rp)
	lea	-16(rp), rp
1:
	test	$1, R8(n)
	jz	1f
	mov	24(up), %r8
	mov	%r8, 24(rp)
1:
	FUNC_EXIT()
	ret				C must not fall through into the basecase

C Basecase for n <= 7: plain 8-byte moves, selected by bits 2, 1 and 0 of n.
	ALIGN(16)
L(bc):	test	$4, R8(n)
	jz	1f
	mov	24(up), %rax
	mov	16(up), %rcx
	mov	8(up), %r8
	mov	(up), %r9
	lea	-32(up), up
	mov	%rax, 24(rp)
	mov	%rcx, 16(rp)
	mov	%r8, 8(rp)
	mov	%r9, (rp)
	lea	-32(rp), rp
1:
	test	$2, R8(n)
	jz	1f
	mov	24(up), %rax
	mov	16(up), %rcx
	lea	-16(up), up
	mov	%rax, 24(rp)
	mov	%rcx, 16(rp)
	lea	-16(rp), rp
1:
	test	$1, R8(n)
	jz	1f
	mov	24(up), %rax
	mov	%rax, 24(rp)
1:
	FUNC_EXIT()
	ret