beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / x86 / pentium / hamdist.asm
blob6c6c1a12583bf40e55af3f2f358bdbf43d999a55
1 dnl Intel P5 mpn_hamdist -- mpn hamming distance.
3 dnl Copyright 2001, 2002, 2014, 2015 Free Software Foundation, Inc.
5 dnl This file is part of the GNU MP Library.
6 dnl
7 dnl The GNU MP Library is free software; you can redistribute it and/or modify
8 dnl it under the terms of either:
9 dnl
10 dnl * the GNU Lesser General Public License as published by the Free
11 dnl Software Foundation; either version 3 of the License, or (at your
12 dnl option) any later version.
13 dnl
14 dnl or
15 dnl
16 dnl * the GNU General Public License as published by the Free Software
17 dnl Foundation; either version 2 of the License, or (at your option) any
18 dnl later version.
19 dnl
20 dnl or both in parallel, as here.
21 dnl
22 dnl The GNU MP Library is distributed in the hope that it will be useful, but
23 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
25 dnl for more details.
26 dnl
27 dnl You should have received copies of the GNU General Public License and the
28 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
29 dnl see https://www.gnu.org/licenses/.
31 include(`../config.m4')
34 C P5: 14.0 cycles/limb
37 C unsigned long mpn_hamdist (mp_srcptr src1, mp_srcptr src2, mp_size_t size);
39 C It might be possible to shave 1 cycle from the loop, and hence 2
40 C cycles/limb. The xorb is taking 2 cycles, but a separate load and xor
41 C would be 1, if the right schedule could be found (not found so far).
42 C Wanting to avoid potential cache bank clashes makes it tricky.
44 C The slightly strange quoting here helps the renaming done by tune/many.pl.
deflit(TABLE_NAME,
m4_assert_defined(`GSYM_PREFIX')
GSYM_PREFIX`'mpn_popcount``'_table')

C FIXME: referencing popcount.asm's table is incorrect as it hurts
C incremental linking.

C Stack arguments, as byte offsets from the stack pointer at entry.
defframe(PARAM_SIZE,12)
defframe(PARAM_SRC2, 8)
defframe(PARAM_SRC1, 4)

	TEXT
	ALIGN(8)

C mpn_hamdist (src1, src2, size)
C
C In:   src1, src2 and size on the stack, see the defframe lines above.
C Out:  eax = running total of the table lookups described below.
C Regs: esi, edi, ebx (and ebp when PIC) are saved on entry and restored
C       before returning; ecx and edx are used as scratch.
C
C Each pass of the loop handles one byte pair from each operand: the
C corresponding bytes of src1 and src2 are xored and the two results are
C used as indexes into the byte table TABLE_NAME (presumably the per-byte
C bit counts kept by popcount.asm, see the FIXME above).  The looked-up
C values are added to the total at the start of the following pass, and
C the last two are added after the loop.
C
C The loop body runs before the counter is tested, so size must be at
C least 1; a zero size would wrap the counter.

PROLOGUE(mpn_hamdist)
deflit(`FRAME',0)

	movl	PARAM_SIZE, %ecx
	pushl	%esi		FRAME_pushl()

	shll	%ecx		C size in byte pairs
	pushl	%edi		FRAME_pushl()

ifdef(`PIC',`
	C PIC: the table address is kept in ebp, so both ebx and ebp are
	C saved here.
	pushl	%ebx		FRAME_pushl()
	pushl	%ebp		FRAME_pushl()
ifdef(`DARWIN',`
	C The LEA macro is used to put the table address in ebp.
	movl	PARAM_SRC1, %esi
	movl	PARAM_SRC2, %edi
	LEA(	TABLE_NAME, %ebp)
	xorl	%ebx, %ebx	C byte
	xorl	%edx, %edx	C byte
	xorl	%eax, %eax	C total
',`
	C call/pop leaves the address of L(here) in ebp, the addl turns that
	C into the GOT address, and the table address is then loaded from its
	C GOT slot.  The parameter loads and register clears are interleaved
	C with those steps.
	call	L(here)		FRAME_pushl()
L(here):
	movl	PARAM_SRC1, %esi
	popl	%ebp		FRAME_popl()

	movl	PARAM_SRC2, %edi
	addl	$_GLOBAL_OFFSET_TABLE_+[.-L(here)], %ebp

	xorl	%ebx, %ebx	C byte
	xorl	%edx, %edx	C byte

	movl	TABLE_NAME@GOT(%ebp), %ebp
	xorl	%eax, %eax	C total
')
define(TABLE,`(%ebp,$1)')
',`
dnl non-PIC
	C The table is addressed directly by name, so ebp is not needed.
	movl	PARAM_SRC1, %esi
	movl	PARAM_SRC2, %edi

	xorl	%eax, %eax	C total
	pushl	%ebx		FRAME_pushl()

	xorl	%edx, %edx	C byte
	xorl	%ebx, %ebx	C byte

define(TABLE,`TABLE_NAME($1)')
')


	C ebx and edx were cleared above and only bl and dl are written from
	C here on, so the full registers can be used both as table indexes
	C and as values to add to the total.
	C
	C The nop after the xorb seems necessary.  Although a movb might be
	C expected to go down the V pipe in the second cycle of the xorb, it
	C doesn't and costs an extra 2 cycles.
L(top):
	C eax	total
	C ebx	byte
	C ecx	counter, 2*size down to 1
	C edx	byte
	C esi	src1
	C edi	src2
	C ebp	[PIC] table

	addl	%ebx, %eax		C add lookup from the previous pass
	movb	-1(%esi,%ecx,2), %bl

	addl	%edx, %eax		C add lookup from the previous pass
	movb	-1(%edi,%ecx,2), %dl

	xorb	%dl, %bl		C upper byte of the pair, src1 xor src2
	movb	-2(%esi,%ecx,2), %dl

	xorb	-2(%edi,%ecx,2), %dl	C lower byte of the pair, src1 xor src2
	nop

	movb	TABLE(%ebx), %bl
	decl	%ecx

	movb	TABLE(%edx), %dl	C movb leaves the flags from decl intact
	jnz	L(top)


ifdef(`PIC',`
	popl	%ebp
')
	C Add the two lookups left pending by the final pass, restoring the
	C saved registers in reverse order of the pushes.
	addl	%ebx, %eax
	popl	%ebx

	addl	%edx, %eax
	popl	%edi

	popl	%esi

	ret

EPILOGUE()
ASM_END()