beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / x86_64 / popham.asm
blob9005f817762fd37e961d0c8e0f0f71f341562fba
1 dnl AMD64 mpn_popcount, mpn_hamdist -- population count and hamming distance.
3 dnl Copyright 2004, 2005, 2007, 2010-2012 Free Software Foundation, Inc.
5 dnl This file is part of the GNU MP Library.
6 dnl
7 dnl The GNU MP Library is free software; you can redistribute it and/or modify
8 dnl it under the terms of either:
9 dnl
10 dnl * the GNU Lesser General Public License as published by the Free
11 dnl Software Foundation; either version 3 of the License, or (at your
12 dnl option) any later version.
13 dnl
14 dnl or
15 dnl
16 dnl * the GNU General Public License as published by the Free Software
17 dnl Foundation; either version 2 of the License, or (at your option) any
18 dnl later version.
19 dnl
20 dnl or both in parallel, as here.
21 dnl
22 dnl The GNU MP Library is distributed in the hope that it will be useful, but
23 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
25 dnl for more details.
26 dnl
27 dnl You should have received copies of the GNU General Public License and the
28 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
29 dnl see https://www.gnu.org/licenses/.
32 include(`../config.m4')
35 C popcount hamdist
36 C cycles/limb cycles/limb
37 C AMD K8,K9 6 7
38 C AMD K10 6 7
39 C Intel P4 12 14.3
40 C Intel core2 7 8
41 C Intel corei ? 7.3
42 C Intel atom 16.5 17.5
43 C VIA nano 8.75 10.4
45 C TODO
46 C * Tune. It should be possible to reach 5 c/l for popcount and 6 c/l for
47 C hamdist for K8/K9.
C Configuration used when OPERATION_popcount is defined: the function is
C mpn_popcount(up, n).  POP is defined to expand to its argument and HAM is
C defined as dnl, so lines wrapped in HAM are discarded in this build.
50 ifdef(`OPERATION_popcount',`
51 define(`func',`mpn_popcount') C name of the generated entry point
52 define(`up', `%rdi') C pointer to the limbs
53 define(`n', `%rsi') C number of limbs
54 define(`h55555555', `%r10') C register for the 0x5555555555555555 mask
55 define(`h33333333', `%r11') C register for the 0x3333333333333333 mask
56 define(`h0f0f0f0f', `%rcx') C register for the 0x0f0f0f0f0f0f0f0f mask
57 define(`h01010101', `%rdx') C register for the 0x0101010101010101 multiplier
58 define(`POP', `$1') C keep popcount-only lines
59 define(`HAM', `dnl') C drop hamdist-only lines
60 ')
C Configuration used when OPERATION_hamdist is defined: the function is
C mpn_hamdist(up, vp, n).  HAM is defined to expand to its argument and POP
C is defined as dnl, so lines wrapped in POP are discarded in this build.
61 ifdef(`OPERATION_hamdist',`
62 define(`func',`mpn_hamdist') C name of the generated entry point
63 define(`up', `%rdi') C pointer to the first operand
64 define(`vp', `%rsi') C pointer to the second operand
65 define(`n', `%rdx') C number of limbs in each operand
66 define(`h55555555', `%r10') C register for the 0x5555555555555555 mask
67 define(`h33333333', `%r11') C register for the 0x3333333333333333 mask
68 define(`h0f0f0f0f', `%rcx') C register for the 0x0f0f0f0f0f0f0f0f mask
69 define(`h01010101', `%r14') C multiplier register; %rdx is taken by n here
70 define(`POP', `dnl') C drop popcount-only lines
71 define(`HAM', `$1') C keep hamdist-only lines
72 ')
C Build and assembler setup.  None of the macros below is defined in this
C file.
C NOTE(review): MULFUNC_PROLOGUE appears to list the entry points this file
C can provide and ABI_SUPPORT the ABIs it may be built for -- confirm against
C the macro definitions.  Comments are kept off the directive lines on
C purpose, in case build scripts read those lines textually.
75 MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist)
77 ABI_SUPPORT(DOS64)
78 ABI_SUPPORT(STD64)
80 ASM_START()
81 TEXT
82 ALIGN(32)
C func is mpn_popcount(up, n) or mpn_hamdist(up, vp, n), depending on which
C OPERATION_ symbol selected the definitions at the top of the file.
C
C popcount: returns in %rax the number of 1 bits in the n limbs at up.
C hamdist:  each limb of up is first xored with the matching limb of vp, and
C           the 1 bits of the results are counted.
C
C Method: a parallel bit count.  Each limb is reduced to 2-bit, then 4-bit,
C then 8-bit partial counts, and a multiply by 0x0101010101010101 adds the
C eight byte counts into the top byte.  The main loop handles two limbs per
C iteration and merges them at the 4-bit stage; an odd leading limb is
C handled separately before the loop.
C
C Registers: %r8/%r9 work on the first limb, %r12/%r13 on the second limb,
C %rax holds the running total.
C
C n must be at least 1.  With n = 0 the loop is entered with index 0 and
C keeps stepping by 2 far past the operand before the index wraps.
83 PROLOGUE(func)
84 POP(` FUNC_ENTRY(2) ') C FUNC_ENTRY is defined outside this file; presumably entry glue for 2 arguments
85 HAM(` FUNC_ENTRY(3) ') C same, for 3 arguments
86 push %r12 C r12 and r13 are callee-saved and used as scratch in the loop
87 push %r13
88 HAM(` push %r14 ') C hamdist keeps the multiplier in callee-saved r14
90 mov $0x5555555555555555, h55555555 C mask: low bit of every 2-bit field
91 mov $0x3333333333333333, h33333333 C mask: low 2 bits of every 4-bit field
92 mov $0x0f0f0f0f0f0f0f0f, h0f0f0f0f C mask: low 4 bits of every byte
93 mov $0x0101010101010101, h01010101 C multiplier that sums the 8 bytes
95 lea (up,n,8), up C up now points just past the last limb
96 HAM(` lea (vp,n,8), vp ') C likewise for vp
97 neg n C n becomes a negative index that counts up to 0
99 xor R32(%rax), R32(%rax) C running total = 0
101 bt $0, R32(n) C CF = low bit of n, i.e. set when the limb count is odd
102 jnc L(top) C even count: go straight to the 2-limb loop
104 mov (up,n,8), %r8 C odd count: process the first limb on its own
105 HAM(` xor (vp,n,8), %r8 ') C hamdist counts the bits that differ
107 mov %r8, %r9
108 shr %r8
109 and h55555555, %r8
110 sub %r8, %r9 C 32 2-bit fields (0..2)
112 mov %r9, %r8
113 shr $2, %r9
114 and h33333333, %r8
115 and h33333333, %r9
116 add %r8, %r9 C 16 4-bit fields (0..4)
118 mov %r9, %r8
119 shr $4, %r9
120 and h0f0f0f0f, %r8
121 and h0f0f0f0f, %r9
122 add %r8, %r9 C 8 8-bit fields (0..8)
124 imul h01010101, %r9 C sum the 8 fields in high 8 bits
125 shr $56, %r9 C move that sum down to the low byte
127 mov %r9, %rax C total was 0, so this count becomes the total
128 add $1, n
129 jz L(end) C that was the only limb
131 ALIGN(16)
132 L(top): mov (up,n,8), %r8 C first limb of the pair
133 mov 8(up,n,8), %r12 C second limb of the pair
134 HAM(` xor (vp,n,8), %r8 ')
135 HAM(` xor 8(vp,n,8), %r12 ')
137 mov %r8, %r9
138 mov %r12, %r13
139 shr %r8
140 shr %r12
141 and h55555555, %r8
142 and h55555555, %r12
143 sub %r8, %r9 C 32 2-bit fields (0..2), first limb
144 sub %r12, %r13 C 32 2-bit fields (0..2), second limb
146 mov %r9, %r8
147 mov %r13, %r12
148 shr $2, %r9
149 shr $2, %r13
150 and h33333333, %r8
151 and h33333333, %r9
152 and h33333333, %r12
153 and h33333333, %r13
154 add %r8, %r9 C 16 4-bit fields (0..4)
155 add %r12, %r13 C 16 4-bit fields (0..4)
157 add %r13, %r9 C 16 4-bit fields (0..8), both limbs merged; 8 still fits in 4 bits
158 mov %r9, %r8
159 shr $4, %r9
160 and h0f0f0f0f, %r8
161 and h0f0f0f0f, %r9
162 add %r8, %r9 C 8 8-bit fields (0..16)
164 imul h01010101, %r9 C sum the 8 fields in high 8 bits
165 shr $56, %r9 C move that sum down to the low byte
167 add %r9, %rax C add to total
168 add $2, n
169 jnc L(top) C n is negative and even here, so the add carries only when n reaches 0
171 L(end):
172 HAM(` pop %r14 ') C restore in reverse order of the pushes
173 pop %r13
174 pop %r12
175 FUNC_EXIT() C defined outside this file; presumably the counterpart of FUNC_ENTRY
176 ret C return with the count in %rax
177 EPILOGUE()