1 dnl AMD K6-2 mpn_popcount, mpn_hamdist -- mpn bit population count and hamming distance.
4 dnl Copyright 2000-2002 Free Software Foundation, Inc.
6 dnl This file is part of the GNU MP Library.
8 dnl The GNU MP Library is free software; you can redistribute it and/or modify
9 dnl it under the terms of either:
11 dnl * the GNU Lesser General Public License as published by the Free
12 dnl Software Foundation; either version 3 of the License, or (at your
13 dnl option) any later version.
17 dnl * the GNU General Public License as published by the Free Software
18 dnl Foundation; either version 2 of the License, or (at your option) any later version.
21 dnl or both in parallel, as here.
23 dnl The GNU MP Library is distributed in the hope that it will be useful, but
24 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
25 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
28 dnl You should have received copies of the GNU General Public License and the
29 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
30 dnl see https://www.gnu.org/licenses/.
32 include(`..
/config.m4
')
36 C K6-2: 9.0 11.5 cycles/limb
40 C unsigned long mpn_popcount (mp_srcptr src, mp_size_t size);
41 C unsigned long mpn_hamdist (mp_srcptr src, mp_srcptr src2, mp_size_t size);
43 C The code here isn't optimal, but it's already a 2x speedup over the plain
44 C integer mpn/generic/popcount.c,hamdist.c.
47 ifdef(`OPERATION_popcount',,
48 `ifdef
(`OPERATION_hamdist
',,
49 `m4_error(`Need OPERATION_popcount or OPERATION_hamdist
54 `ifdef
(`OPERATION_hamdist
',`$1')')
58 `ifdef(`OPERATION_popcount',`
$1')')
61 defframe
(PARAM_SIZE
, 12)
62 defframe
(PARAM_SRC2
, 8)
63 defframe
(PARAM_SRC
, 4)
64 define
(M4_function
,mpn_hamdist
)
67 defframe(PARAM_SIZE, 8)
68 defframe(PARAM_SRC, 4)
69 define(M4_function,mpn_popcount)
72 MULFUNC_PROLOGUE
(mpn_popcount mpn_hamdist
)
81 L(rodata_AAAAAAAAAAAAAAAA):
85 L(rodata_3333333333333333):
89 L(rodata_0F0F0F0F0F0F0F0F):
93 L(rodata_000000FF000000FF):
102 C avoid shrl crossing a 32-byte boundary
105 PROLOGUE(M4_function)
108 movl PARAM_SIZE
, %ecx
111 movl $0xAAAAAAAA, %eax
112 movl $0x33333333, %edx
117 movl $0x0F0F0F0F, %eax
118 movl $0x000000FF, %edx
130 movq L
(rodata_AAAAAAAAAAAAAAAA
), %mm7
131 movq L
(rodata_3333333333333333
), %mm6
132 movq L
(rodata_0F0F0F0F0F0F0F0F
), %mm5
133 movq L
(rodata_000000FF000000FF
), %mm4
136 define(REG_AAAAAAAAAAAAAAAA, %mm7)
137 define(REG_3333333333333333, %mm6)
138 define(REG_0F0F0F0F0F0F0F0F, %mm5)
139 define(REG_000000FF000000FF, %mm4)
143 HAM(` movl PARAM_SRC2, %edx')
145 pxor
%mm2
, %mm2 C total
150 Zdisp
( movd
, 0,(%eax,%ecx,8), %mm1
)
153 Zdisp
( movd
, 0,(%edx,%ecx,8), %mm0
)
162 POP(` nop C alignment to avoid crossing 32-byte boundaries')
167 C ecx counter, qwords, decrementing
172 C mm2 total (low dword)
175 C mm5 | special constants
179 movq
-8(%eax,%ecx,8), %mm1
180 HAM
(` pxor
-8(%edx,%ecx,8), %mm1
')
184 pand REG_AAAAAAAAAAAAAAAA, %mm1
187 HAM(` nop C code alignment')
189 psubd
%mm1
, %mm0 C bit pairs
190 HAM
(`
nop C code alignment
')
196 pand REG_3333333333333333, %mm0
197 pand REG_3333333333333333, %mm1
199 paddd %mm1, %mm0 C nibbles
205 pand REG_0F0F0F0F0F0F0F0F, %mm0
206 pand REG_0F0F0F0F0F0F0F0F, %mm1
208 paddd %mm1, %mm0 C bytes
214 paddb %mm1, %mm0 C words
220 paddd %mm1, %mm0 C dwords
222 pand REG_000000FF000000FF, %mm0
224 paddd %mm0, %mm2 C low to total
227 paddd %mm0, %mm2 C high to total