dnl  Provenance: luatex.git, revision label beta-0.89.2
dnl  Path: source/libs/gmp/gmp-src/mpn/x86/k7/mmx/popham.asm
dnl  Blob: 95965b74d40046ab3acd767d9dbcf9b54aebe6ff
1 dnl AMD K7 mpn_popcount, mpn_hamdist -- population count and hamming
2 dnl distance.
4 dnl Copyright 2000-2002 Free Software Foundation, Inc.
6 dnl This file is part of the GNU MP Library.
7 dnl
8 dnl The GNU MP Library is free software; you can redistribute it and/or modify
9 dnl it under the terms of either:
10 dnl
11 dnl * the GNU Lesser General Public License as published by the Free
12 dnl Software Foundation; either version 3 of the License, or (at your
13 dnl option) any later version.
14 dnl
15 dnl or
16 dnl
17 dnl * the GNU General Public License as published by the Free Software
18 dnl Foundation; either version 2 of the License, or (at your option) any
19 dnl later version.
20 dnl
21 dnl or both in parallel, as here.
22 dnl
23 dnl The GNU MP Library is distributed in the hope that it will be useful, but
24 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
25 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
26 dnl for more details.
27 dnl
28 dnl You should have received copies of the GNU General Public License and the
29 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
30 dnl see https://www.gnu.org/licenses/.
32 include(`../config.m4')
C                           cycles/limb
C                        popcount   hamdist
C P3 generic                6.5       7
C P3 model 9  (Banias)      5.7       6.1
C P3 model 13 (Dothan)      5.75      6
C K7                        5         6
41 C unsigned long mpn_popcount (mp_srcptr src, mp_size_t size);
42 C unsigned long mpn_hamdist (mp_srcptr src, mp_srcptr src2, mp_size_t size);
44 C The code here is almost certainly not optimal, but is already a 3x speedup
45 C over the generic C code. The main improvement would be to interleave
46 C processing of two qwords in the loop so as to fully exploit the available
47 C execution units, possibly leading to 3.25 c/l (13 cycles for 4 limbs).
49 C The loop is based on the example "Efficient 64-bit population count using
50 C MMX instructions" in the Athlon Optimization Guide, AMD document 22007,
51 C page 158 of rev E (reference in mpn/x86/k7/README).
dnl  Raise an m4 error unless the build has selected which function to
dnl  generate by defining OPERATION_popcount or OPERATION_hamdist.

ifdef(`OPERATION_popcount',,
`ifdef(`OPERATION_hamdist',,
`m4_error(`Need OPERATION_popcount or OPERATION_hamdist defined
')')')
dnl  Usage: HAM(text)
dnl         POP(text)
dnl
dnl  Each takes exactly one argument.  HAM expands to its argument only when
dnl  OPERATION_hamdist is defined, POP only when OPERATION_popcount is
dnl  defined; otherwise each expands to nothing.  They let the two
dnl  functions share one source while differing in a few lines.

define(HAM,
m4_assert_numargs(1)
`ifdef(`OPERATION_hamdist',`$1')')

define(POP,
m4_assert_numargs(1)
`ifdef(`OPERATION_popcount',`$1')')
dnl  Stack parameter names and the function name for the selected operation.
dnl  The slot order follows the C prototypes: hamdist takes src then src2
dnl  then size, popcount takes src then size.  defframe is defined outside
dnl  this file; its numeric argument is presumably the byte offset of the
dnl  parameter from the stack pointer at function entry.

HAM(`
defframe(PARAM_SIZE,   12)
defframe(PARAM_SRC2,   8)
defframe(PARAM_SRC,    4)
define(M4_function,mpn_hamdist)
')
POP(`
defframe(PARAM_SIZE,   8)
defframe(PARAM_SRC,    4)
define(M4_function,mpn_popcount)
')

dnl  MULFUNC_PROLOGUE is defined outside this file; here it is given the
dnl  names of both entry points this source can produce.

MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist)
C 64-bit bit masks used by the counting loop.  They are emitted only when
C PIC is not defined, where the function loads them with movq; the PIC
C build constructs the same values in registers instead.

ifdef(`PIC',,`
	dnl  non-PIC

	RODATA
	ALIGN(8)

L(rodata_AAAAAAAAAAAAAAAA):
	.long	0xAAAAAAAA
	.long	0xAAAAAAAA

L(rodata_3333333333333333):
	.long	0x33333333
	.long	0x33333333

L(rodata_0F0F0F0F0F0F0F0F):
	.long	0x0F0F0F0F
	.long	0x0F0F0F0F
')
C M4_function -- return in eax the number of set bits in the size limbs at
C src, or for hamdist the number of set bits in src xor src2.
C
C Limbs are 4 bytes.  size must be at least 1: a zero size would enter the
C loop with a zero counter, which then wraps instead of terminating.
C
C Limbs are consumed two at a time as 64-bit qwords, from the top of the
C operand downwards.  When size is odd the topmost limb is fetched first
C with a 32-bit load and joins the loop body at L(loaded).
C
C Each qword is reduced in the classic way: 2-bit sums, then 4-bit sums,
C then per-byte sums, and finally psadbw against zero adds the eight byte
C counts together.
C
C Registers written: eax, ecx, edx, mm0, mm1, mm2, mm4, mm5, mm6, mm7.
C The MMX state is cleared with emms before returning.

	TEXT
	ALIGN(32)

PROLOGUE(M4_function)
deflit(`FRAME',0)

	movl	PARAM_SIZE, %ecx	C limb count

ifdef(`PIC',`
	C PIC build: make the masks from immediates rather than referring
	C to the rodata labels.
	movl	$0xAAAAAAAA, %eax
	movl	$0x33333333, %edx

	movd	%eax, %mm7
	movd	%edx, %mm6

	movl	$0x0F0F0F0F, %eax

	punpckldq %mm7, %mm7		C copy low dword into high dword
	punpckldq %mm6, %mm6

	movd	%eax, %mm5

	punpckldq %mm5, %mm5

',`
	C non-PIC build: load the ready-made 64-bit masks.
	movq	L(rodata_AAAAAAAAAAAAAAAA), %mm7
	movq	L(rodata_3333333333333333), %mm6
	movq	L(rodata_0F0F0F0F0F0F0F0F), %mm5
')
	pxor	%mm4, %mm4		C all zeros, used by psadbw below

define(REG_AAAAAAAAAAAAAAAA,%mm7)
define(REG_3333333333333333,%mm6)
define(REG_0F0F0F0F0F0F0F0F,%mm5)
define(REG_0000000000000000,%mm4)


	movl	PARAM_SRC, %eax
HAM(`	movl	PARAM_SRC2, %edx')

	pxor	%mm2, %mm2	C total

	shrl	%ecx		C ecx = qword count, carry = odd limb present
	jnc	L(top)

	C Odd size: take the single topmost limb.  movd leaves the upper
	C half of the MMX register zero so it counts as a normal qword.
	movd	(%eax,%ecx,8), %mm1

HAM(`	movd	(%edx,%ecx,8), %mm0
	pxor	%mm0, %mm1
')
	orl	%ecx, %ecx	C set ZF for the jnz at the loop end
	jmp	L(loaded)


	ALIGN(16)
L(top):
	C eax	src
	C ebx
	C ecx	counter, qwords, decrementing
	C edx	[hamdist] src2
	C
	C mm0	(scratch)
	C mm1	(scratch)
	C mm2	total (low dword)
	C mm3
	C mm4	\
	C mm5	| special constants
	C mm6	|
	C mm7	/

	movq	-8(%eax,%ecx,8), %mm1

HAM(`	pxor	-8(%edx,%ecx,8), %mm1')
	decl	%ecx		C ZF consumed by jnz below; the MMX
				C instructions in between leave flags alone

L(loaded):
	movq	%mm1, %mm0
	pand	REG_AAAAAAAAAAAAAAAA, %mm1

	psrlq	$1, %mm1

	psubd	%mm1, %mm0	C bit pairs


	movq	%mm0, %mm1
	psrlq	$2, %mm0

	pand	REG_3333333333333333, %mm0
	pand	REG_3333333333333333, %mm1

	paddd	%mm1, %mm0	C nibbles


	movq	%mm0, %mm1
	psrlq	$4, %mm0

	pand	REG_0F0F0F0F0F0F0F0F, %mm0
	pand	REG_0F0F0F0F0F0F0F0F, %mm1

	paddd	%mm1, %mm0	C bytes


	psadbw(	%mm4, %mm0)	C sum of the eight byte counts

	paddd	%mm0, %mm2	C add to total
	jnz	L(top)


	movd	%mm2, %eax	C return value
	emms
	ret

EPILOGUE()