dnl  Extracted from luatex.git (beta-0.89.2):
dnl    source/libs/gmp/gmp-src/mpn/x86/k6/mmx/popham.asm
dnl    blob 2b19d0b5ee0a7796cb03a680158503d5597e0414

dnl  AMD K6-2 mpn_popcount, mpn_hamdist -- mpn bit population count and
dnl  hamming distance.

dnl  Copyright 2000-2002 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

dnl  Pull in the build configuration and the shared asm macro definitions.
include(`../config.m4')


C        popcount  hamdist
C K6-2:     9.0      11.5   cycles/limb
C K6:      12.5      13.0


C unsigned long mpn_popcount (mp_srcptr src, mp_size_t size);
C unsigned long mpn_hamdist (mp_srcptr src, mp_srcptr src2, mp_size_t size);
C
C The code here isn't optimal, but it's already a 2x speedup over the plain
C integer mpn/generic/popcount.c,hamdist.c.


dnl  At least one of OPERATION_popcount or OPERATION_hamdist must be defined
dnl  by whoever runs m4 on this file; otherwise report the problem and stop
dnl  m4 with exit status 1.
ifdef(`OPERATION_popcount',,
`ifdef(`OPERATION_hamdist',,
`m4_error(`Need OPERATION_popcount or OPERATION_hamdist
')m4exit(1)')')

dnl  Usage: HAM(text)
dnl
dnl  Expands to its single argument when OPERATION_hamdist is defined and
dnl  to nothing otherwise.  Used to mark code needed only by mpn_hamdist.
define(HAM,
m4_assert_numargs(1)
`ifdef(`OPERATION_hamdist',`$1')')

dnl  Usage: POP(text)
dnl
dnl  Expands to its single argument when OPERATION_popcount is defined and
dnl  to nothing otherwise.  Used to mark code needed only by mpn_popcount.
define(POP,
m4_assert_numargs(1)
`ifdef(`OPERATION_popcount',`$1')')

dnl  mpn_hamdist (src, src2, size): parameter slots and function name.
dnl  The numbers are presumably byte offsets of the stack arguments at
dnl  function entry -- confirm against the defframe definition.
HAM(`
defframe(PARAM_SIZE,   12)
defframe(PARAM_SRC2,   8)
defframe(PARAM_SRC,    4)
define(M4_function,mpn_hamdist)
')
dnl  mpn_popcount (src, size): parameter slots and function name.
dnl  The numbers are presumably byte offsets of the stack arguments at
dnl  function entry -- confirm against the defframe definition.
POP(`
defframe(PARAM_SIZE,   8)
defframe(PARAM_SRC,    4)
define(M4_function,mpn_popcount)
')

dnl  This one source file provides both functions, one per OPERATION_*
dnl  setting.  Presumably this line is what the build scans to learn that
dnl  -- confirm against the configure machinery.
MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist)


dnl  Non-PIC builds keep the four 64-bit mask constants in read-only data
dnl  and load them with movq.  PIC builds skip this table and build the
dnl  same values in registers inside the function instead.
ifdef(`PIC',,`
        dnl  non-PIC

        RODATA
        ALIGN(8)

L(rodata_AAAAAAAAAAAAAAAA):
        .long   0xAAAAAAAA
        .long   0xAAAAAAAA

L(rodata_3333333333333333):
        .long   0x33333333
        .long   0x33333333

L(rodata_0F0F0F0F0F0F0F0F):
        .long   0x0F0F0F0F
        .long   0x0F0F0F0F

L(rodata_000000FF000000FF):
        .long   0x000000FF
        .long   0x000000FF
')

C -----------------------------------------------------------------------
C M4_function is mpn_popcount or mpn_hamdist, chosen by OPERATION_*.
C
C   popcount: number of 1 bits in the size limbs at src
C   hamdist:  number of 1 bits in the XOR of the size limbs at src and src2
C
C In:   PARAM_SRC, PARAM_SIZE, and for hamdist PARAM_SRC2
C Out:  eax = bit count
C Uses: eax, ecx, edx (PIC constants and hamdist src2), mm0-mm2, mm4-mm7,
C       flags.  MMX state is cleared by emms_or_femms before returning.
C
C Limbs are 32 bits and are processed two at a time as one 64-bit MMX
C qword.  An odd size is handled by loading the last limb on its own and
C entering the loop at L(loaded).
C
C size is assumed to be at least 1.  With size 0 the loop would be entered
C with ecx = 0 and the loop instruction would wrap the counter.
C
C The nop padding and blank-line grouping below are part of the K6 code
C alignment tuning; do not reorder or remove instructions casually.
C -----------------------------------------------------------------------

        TEXT
        ALIGN(32)

POP(`ifdef(`PIC', `
        C avoid shrl crossing a 32-byte boundary
        nop')')

PROLOGUE(M4_function)
deflit(`FRAME',0)

        movl    PARAM_SIZE, %ecx

C Load the four mask constants into mm7..mm4.  PIC builds synthesise them
C from immediates (no absolute data references), non-PIC builds read the
C rodata table.
ifdef(`PIC',`
        movl    $0xAAAAAAAA, %eax
        movl    $0x33333333, %edx

        movd    %eax, %mm7
        movd    %edx, %mm6

        movl    $0x0F0F0F0F, %eax
        movl    $0x000000FF, %edx

        punpckldq %mm7, %mm7
        punpckldq %mm6, %mm6

        movd    %eax, %mm5
        movd    %edx, %mm4

        punpckldq %mm5, %mm5
        punpckldq %mm4, %mm4

',`
        movq    L(rodata_AAAAAAAAAAAAAAAA), %mm7
        movq    L(rodata_3333333333333333), %mm6
        movq    L(rodata_0F0F0F0F0F0F0F0F), %mm5
        movq    L(rodata_000000FF000000FF), %mm4
')

define(REG_AAAAAAAAAAAAAAAA, %mm7)
define(REG_3333333333333333, %mm6)
define(REG_0F0F0F0F0F0F0F0F, %mm5)
define(REG_000000FF000000FF, %mm4)


        movl    PARAM_SRC, %eax
HAM(`   movl    PARAM_SRC2, %edx')

        pxor    %mm2, %mm2      C total

        C ecx = size/2 (qword count), carry = size was odd
        shrl    %ecx
        jnc     L(top)

        C Odd size: fetch the final limb alone (upper half of mm1 is zero),
        C then count it as one more loop iteration.
        C NOTE(review): Zdisp presumably forces an explicit zero
        C displacement for code sizing -- confirm against its definition.
Zdisp(  movd,   0,(%eax,%ecx,8), %mm1)

HAM(`
Zdisp(  movd,   0,(%edx,%ecx,8), %mm0)
        pxor    %mm0, %mm1
')

        incl    %ecx
        jmp     L(loaded)


        ALIGN(16)
POP(`   nop     C alignment to avoid crossing 32-byte boundaries')

L(top):
        C eax   src
        C ebx
        C ecx   counter, qwords, decrementing
        C edx   [hamdist] src2

        C mm0   (scratch)
        C mm1   (scratch)
        C mm2   total (low dword)
        C mm3
        C mm4   \
        C mm5   | special constants
        C mm6   |
        C mm7   /

        movq    -8(%eax,%ecx,8), %mm1
HAM(`   pxor    -8(%edx,%ecx,8), %mm1')

L(loaded):
        C x - ((x & 0xAA..) >> 1) leaves a 2-bit count in every bit pair
        movq    %mm1, %mm0
        pand    REG_AAAAAAAAAAAAAAAA, %mm1

        psrlq   $1, %mm1
HAM(`   nop                     C code alignment')

        psubd   %mm1, %mm0      C bit pairs
HAM(`   nop                     C code alignment')


        movq    %mm0, %mm1
        psrlq   $2, %mm0

        pand    REG_3333333333333333, %mm0
        pand    REG_3333333333333333, %mm1

        paddd   %mm1, %mm0      C nibbles


        movq    %mm0, %mm1
        psrlq   $4, %mm0

        pand    REG_0F0F0F0F0F0F0F0F, %mm0
        pand    REG_0F0F0F0F0F0F0F0F, %mm1

        paddd   %mm1, %mm0      C bytes

        movq    %mm0, %mm1
        psrlq   $8, %mm0


        paddb   %mm1, %mm0      C words


        movq    %mm0, %mm1
        psrlq   $16, %mm0

        paddd   %mm1, %mm0      C dwords

        C keep only the low byte of each dword, which holds that dword count
        pand    REG_000000FF000000FF, %mm0

        paddd   %mm0, %mm2      C low to total
        psrlq   $32, %mm0

        paddd   %mm0, %mm2      C high to total
        loop    L(top)



        movd    %mm2, %eax
        emms_or_femms
        ret

EPILOGUE()