dnl  Intel P5 mpn_hamdist -- mpn hamming distance.
dnl  Copyright 2001, 2002, 2014, 2015 Free Software Foundation, Inc.
dnl  This file is part of the GNU MP Library.
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.
31 include(`..
/config.m4
')
34 C P5: 14.0 cycles/limb
37 C unsigned long mpn_hamdist (mp_srcptr src1, mp_srcptr src2, mp_size_t size);
39 C It might be possible to shave 1 cycle from the loop, and hence 2
40 C cycles/limb. The xorb is taking 2 cycles, but a separate load and xor
41 C would be 1, if the right schedule could be found (not found so far).
42 C Wanting to avoid potential cache bank clashes makes it tricky.
44 C The slightly strange quoting here helps the renaming done by tune/many.pl.
46 m4_assert_defined(`GSYM_PREFIX')
47 GSYM_PREFIX`
'mpn_popcount``'_table
')
C FIXME: referencing popcount.asm's table is incorrect as it hurts incremental
C linking.
C Stack slots of the three arguments, by byte offset.  With the prototype
C (src1, src2, size) that is src1 at 4, src2 at 8 and size at 12.
C NOTE(review): defframe is defined outside this file -- presumably the
C offsets are relative to the stack pointer at function entry; confirm.
C Each call must sit on one line with the open parenthesis directly after
C the macro name, otherwise m4 does not collect the arguments.
defframe(PARAM_SIZE,12)
defframe(PARAM_SRC2, 8)
defframe(PARAM_SRC1, 4)
63 pushl %esi FRAME_pushl()
65 shll %ecx C size in byte pairs
66 pushl %edi FRAME_pushl()
69 pushl
%ebx FRAME_pushl
()
70 pushl
%ebp FRAME_pushl
()
74 LEA( TABLE_NAME, %ebp)
75 xorl %ebx, %ebx C byte
76 xorl %edx, %edx C byte
77 xorl %eax, %eax C total
79 call L
(here
) FRAME_pushl
()
82 popl
%ebp FRAME_popl
()
85 addl $_GLOBAL_OFFSET_TABLE_
+[.
-L
(here
)], %ebp
87 xorl
%ebx, %ebx C
byte
88 xorl
%edx, %edx C
byte
90 movl TABLE_NAME
@GOT(%ebp), %ebp
91 xorl
%eax, %eax C total
93 define(TABLE,`(%ebp,$1)')
99 xorl %eax, %eax C total
100 pushl %ebx FRAME_pushl()
102 xorl %edx, %edx C byte
103 xorl %ebx, %ebx C byte
105 define(TABLE,`TABLE_NAME($1)')
C The nop after the xorb seems necessary.  Although a movb might be
C expected to go down the V pipe in the second cycle of the xorb, it
C doesn't and costs an extra 2 cycles.
C ecx	counter, 2*size to 2
122 movb
-1(%esi,%ecx,2), %bl
125 movb
-1(%edi,%ecx,2), %dl
128 movb
-2(%esi,%ecx,2), %dl
130 xorb
-2(%edi,%ecx,2), %dl
133 movb TABLE
(%ebx), %bl
136 movb TABLE
(%edx), %dl