dnl  AMD K7 mpn_popcount, mpn_hamdist -- population count and hamming
dnl  distance.
dnl  Copyright 2000-2002 Free Software Foundation, Inc.
dnl  This file is part of the GNU MP Library.
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.
dnl  Pull in the m4 configuration file from the parent directory.  The
dnl  quoted filename must be on one line: a newline inside the quotes
dnl  becomes part of the filename.
include(`../config.m4')
C P3 model 9  (Banias)		5.7		6.1
C P3 model 13 (Dothan)		5.75		6
C unsigned long mpn_popcount (mp_srcptr src, mp_size_t size);
C unsigned long mpn_hamdist (mp_srcptr src, mp_srcptr src2, mp_size_t size);
C The code here is almost certainly not optimal, but is already a 3x speedup
C over the generic C code.  The main improvement would be to interleave
C processing of two qwords in the loop so as to fully exploit the available
C execution units, possibly leading to 3.25 c/l (13 cycles for 4 limbs).
C The loop is based on the example "Efficient 64-bit population count using
C MMX instructions" in the Athlon Optimization Guide, AMD document 22007,
C page 158 of rev E (reference in mpn/x86/k7/README).
53 ifdef(`OPERATION_popcount',,
54 `ifdef
(`OPERATION_hamdist
',,
55 `m4_error(`Need OPERATION_popcount or OPERATION_hamdist defined
60 `ifdef
(`OPERATION_hamdist
',`$1')')
64 `ifdef(`OPERATION_popcount',`
$1')')
dnl  Parameter frame offsets and function name for the mpn_hamdist build,
dnl  following the prototype mpn_hamdist (src, src2, size): src at 4, src2
dnl  at 8, size at 12.
dnl
dnl  Each macro name must be followed immediately by its open parenthesis;
dnl  with whitespace in between m4 expands the macro without arguments.
dnl
dnl  NOTE(review): a second defframe/define group for mpn_popcount also
dnl  exists in this file.  Presumably each group is meant to be selected by
dnl  the matching OPERATION_* symbol, but no selecting wrapper is visible
dnl  around these lines -- confirm against the complete file.
defframe(PARAM_SIZE, 12)
defframe(PARAM_SRC2, 8)
defframe(PARAM_SRC,  4)
define(M4_function,mpn_hamdist)
dnl  Parameter frame offsets and function name for the mpn_popcount build,
dnl  following the prototype mpn_popcount (src, size): src at 4, size at 8.
dnl  There is no PARAM_SRC2 here because popcount takes a single source.
defframe(PARAM_SIZE, 8)
defframe(PARAM_SRC,  4)
define(M4_function,mpn_popcount)
dnl  Names the two functions, mpn_popcount and mpn_hamdist, that this one
dnl  source file covers.  Kept on a single line so the macro name is
dnl  followed immediately by its argument list.
MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist)
87 L(rodata_AAAAAAAAAAAAAAAA):
91 L(rodata_3333333333333333):
95 L(rodata_0F0F0F0F0F0F0F0F):
103 PROLOGUE
(M4_function
)
106 movl PARAM_SIZE, %ecx
109 movl
$0xAAAAAAAA
, %eax
110 movl
$0x33333333
, %edx
115 movl
$0x0F0F0F0F
, %eax
126 movq L(rodata_AAAAAAAAAAAAAAAA), %mm7
127 movq L(rodata_3333333333333333), %mm6
128 movq L(rodata_0F0F0F0F0F0F0F0F), %mm5
dnl  Symbolic names for the MMX registers that hold the constants used by
dnl  the counting steps.  mm7, mm6 and mm5 are the registers loaded from the
dnl  rodata_AAAAAAAAAAAAAAAA, rodata_3333333333333333 and
dnl  rodata_0F0F0F0F0F0F0F0F constants; mm4 is named as the all-zero value.
dnl
dnl  Each define is on one line so that the macro name carries no trailing
dnl  newline and define receives its arguments.
define(REG_AAAAAAAAAAAAAAAA,%mm7)
define(REG_3333333333333333,%mm6)
define(REG_0F0F0F0F0F0F0F0F,%mm5)
define(REG_0000000000000000,%mm4)
139 HAM
(` movl PARAM_SRC2
, %edx')
141 pxor %mm2, %mm2 C total
146 movd (%eax,%ecx,8), %mm1
148 HAM(` movd (%edx,%ecx,8), %mm0
	C ecx	counter, qwords, decrementing
	C mm2	total (low dword)
	C mm5	| special constants
171 movq
-8(%eax,%ecx,8), %mm1
173 HAM
(` pxor
-8(%edx,%ecx,8), %mm1
')
178 pand REG_AAAAAAAAAAAAAAAA, %mm1
182 psubd %mm1, %mm0 C bit pairs
188 pand REG_3333333333333333, %mm0
189 pand REG_3333333333333333, %mm1
191 paddd %mm1, %mm0 C nibbles
197 pand REG_0F0F0F0F0F0F0F0F, %mm0
198 pand REG_0F0F0F0F0F0F0F0F, %mm1
200 paddd %mm1, %mm0 C bytes
205 paddd %mm0, %mm2 C add to total