dnl  IA-64 mpn_hamdist -- mpn hamming distance.
dnl
dnl  Contributed to the GNU project by Torbjorn Granlund.
dnl
dnl  Copyright 2003-2005 Free Software Foundation, Inc.
dnl
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.
dnl  The include path must be one unbroken quoted string: m4 keeps any newline
dnl  that sits inside the quotes as part of the file name.
include(`../config.m4')
C Register aliases, as used by the visible code:
C   u0-u3  limbs loaded through up
C   v0-v3  limbs loaded through vp
C   x0-x3  xor of the matching u and v limb
C   c0-c3  popcnt of the matching x word
C Each quoted register name must close on the same line; a newline inside the
C quotes would become part of the expansion.
define(`u0',`r16') define(`u1',`r17') define(`u2',`r18') define(`u3',`r19')
define(`v0',`r20') define(`v1',`r21') define(`v2',`r22') define(`v3',`r23')
define(`x0',`r24') define(`x1',`r25') define(`x2',`r26') define(`x3',`r27')
define(`c0',`r28') define(`c1',`r29') define(`c2',`r30') define(`c3',`r31')
C Function entry as visible in this view: pointer fix-up, first limb loads,
C and dispatch on the low two bits of n.
C NOTE(review): this region looks incomplete.  The back-quote on the first
C addp4 line has no visible opener or closer, and no definitions of up, vp
C or n are visible.  Check against the pristine source before relying on
C these notes.
C addp4 rewrites up and vp in place (32-bit pointer to 64-bit address form
C per the IA-64 ISA); presumably guarded by a 32-bit ABI conditional that is
C not visible here -- TODO confirm.
55 ` addp4 up
= 0, up C M I
56 addp4 vp
= 0, vp C M I
C r10/r11 = first limb of each operand; both pointers post-increment by 8.
C r2 keeps the incoming ar.lc so it can be put back before returning.
61 {.mmi; ld8 r10 = [up], 8 C load first ulimb M01
62 ld8 r11 = [vp], 8 C load first vlimb M01
63 mov.i r2 = ar.lc C save ar.lc I0
C r14 = n & 3 selects the feed-in path; p15 = (4 < n) marks the long case.
64 }{.mmi; and r14 = 3, n C M I
65 cmp.lt p15, p0 = 4, n C small count? M I
C p6/p7/p8 = (r14 == 1) / (r14 == 2) / (r14 == 3).
68 }{.mmi; cmp.eq p6, p0 = 1, r14 C M I
69 cmp.eq p7, p0 = 2, r14 C M I
70 cmp.eq p8, p0 = 3, r14 C M I
C Dispatch: p6 -> .Lb01, p7 -> .Lb10, p8 -> .Lb11.  With none of them set
C (r14 == 0) control falls through to the code that follows.
72 (p6) br.dptk .Lb01 C B
73 (p7) br.dptk .Lb10 C B
74 (p8) br.dptk .Lb11 C B
C .Lb00: feed-in path for n & 3 == 0 (reached by fall-through from the
C dispatch branches).  x0 = xor of the first limb pair held in r10/r11; the
C next three limb pairs are loaded into u1-u3 / v1-v3.
78 .Lb00: ld8 u1 = [up], 8 C M01
79 ld8 v1 = [vp], 8 C M01
81 xor x0 = r10, r11 C M I
83 ld8 u2 = [up], 8 C M01
84 ld8 v2 = [vp], 8 C M01
88 ld8 u3 = [up], 8 C M01
89 ld8 v3 = [vp], 8 C M01
C p15 is (4 < n): more limbs remain, continue at .grt4.
C NOTE(review): nothing is visible here for the not-taken case; the handling
C of exactly four limbs appears to be missing from this view.
92 (p15) br.cond.dptk .grt4 C B
C .grt4: load the next four limb pairs; each xor consumes the pair that was
C loaded into the same register pair earlier, before those registers are
C reloaded.
102 .grt4: ld8 u0 = [up], 8 C M01
103 ld8 v0 = [vp], 8 C M01
104 xor x1 = u1, v1 C M I
106 ld8 u1 = [up], 8 C M01
107 ld8 v1 = [vp], 8 C M01
108 xor x2 = u2, v2 C M I
110 ld8 u2 = [up], 8 C M01
111 ld8 v2 = [vp], 8 C M01
113 xor x3 = u3, v3 C M I
115 ld8 u3 = [up], 8 C M01
116 ld8 v3 = [vp], 8 C M01
118 xor x0 = u0, v0 C M I
C Counted-loop branch on ar.lc.  NOTE(review): the instruction that sets
C ar.lc is not visible in this view.
119 br.cloop.dpnt .grt8 C B
122 xor x1 = u1, v1 C M I
C .grt8: next round of loads.  NOTE(review): the remainder of this path,
C including where it joins the main loop, is not visible here.
125 .grt8: ld8 u0 = [up], 8 C M01
126 ld8 v0 = [vp], 8 C M01
128 xor x1 = u1, v1 C M I
C .Lb01: feed-in path for n & 3 == 1 (entered when p6 is set).
C x3 = xor of the first limb pair held in r10/r11.
132 .Lb01: xor x3 = r10, r11 C M I
C p15 is (4 < n): more limbs remain, continue at .grt1.  Otherwise return
C through b0.
C NOTE(review): no popcnt of x3 and no write of a return value is visible
C before this return; those lines appear to be missing from this view.
134 (p15) br.cond.dptk .grt1 C B
137 br.ret.sptk.many b0 C B
C .grt1: load four further limb pairs into u0-u3 / v0-v3 and start xoring.
139 .grt1: ld8 u0 = [up], 8 C M01
140 ld8 v0 = [vp], 8 C M01
143 ld8 u1 = [up], 8 C M01
144 ld8 v1 = [vp], 8 C M01
147 ld8 u2 = [up], 8 C M01
148 ld8 v2 = [vp], 8 C M01
150 ld8 u3 = [up], 8 C M01
151 ld8 v3 = [vp], 8 C M01
152 xor x0 = u0, v0 C M I
C Counted-loop branch on ar.lc; when not taken, the three xors below finish
C the limb pairs that are already loaded.
153 br.cloop.dpnt .grt5 C B
155 xor x1 = u1, v1 C M I
158 xor x2 = u2, v2 C M I
161 xor x3 = u3, v3 C M I
C .grt5: reload all four register pairs, xoring each previously loaded pair
C just before it is overwritten, then enter the main loop if ar.lc allows.
166 .grt5: ld8 u0 = [up], 8 C M01
167 ld8 v0 = [vp], 8 C M01
168 xor x1 = u1, v1 C M I
170 ld8 u1 = [up], 8 C M01
171 ld8 v1 = [vp], 8 C M01
173 xor x2 = u2, v2 C M I
175 ld8 u2 = [up], 8 C M01
176 ld8 v2 = [vp], 8 C M01
178 xor x3 = u3, v3 C M I
180 ld8 u3 = [up], 8 C M01
181 ld8 v3 = [vp], 8 C M01
183 xor x0 = u0, v0 C M I
184 br.cloop.dpnt .Loop C B
C .Lb10: feed-in path for n & 3 == 2 (entered when p7 is set).
C The second limb pair is loaded into u3/v3; x2 = xor of the first pair held
C in r10/r11.
188 .Lb10: ld8 u3 = [up], 8 C M01
189 ld8 v3 = [vp], 8 C M01
190 xor x2 = r10, r11 C M I
C p15 is (4 < n): more limbs remain, continue at .grt2.  Otherwise xor the
C second pair and return through b0.
C NOTE(review): no popcnt or return-value write is visible on the short
C path; those lines appear to be missing from this view.
191 (p15) br.cond.dptk .grt2 C B
193 xor x3 = u3, v3 C M I
200 br.ret.sptk.many b0 C B
C .grt2: load the next limb pairs; u3/v3 are xored into x3 before being
C reloaded.
202 .grt2: ld8 u0 = [up], 8 C M01
203 ld8 v0 = [vp], 8 C M01
206 ld8 u1 = [up], 8 C M01
207 ld8 v1 = [vp], 8 C M01
211 ld8 u2 = [up], 8 C M01
212 ld8 v2 = [vp], 8 C M01
213 xor x3 = u3, v3 C M I
215 ld8 u3 = [up], 8 C M01
216 ld8 v3 = [vp], 8 C M01
217 xor x0 = u0, v0 C M I
C Counted-loop branch on ar.lc; when not taken, the xors below finish the
C limb pairs that are already loaded.
218 br.cloop.dptk .grt6 C B
221 xor x1 = u1, v1 C M I
224 xor x2 = u2, v2 C M I
227 xor x3 = u3, v3 C M I
C .grt6: next round of loads and xors.
C NOTE(review): no branch from here into the main loop is visible; in this
C view the code runs straight into the next labelled path.
230 .grt6: ld8 u0 = [up], 8 C M01
231 ld8 v0 = [vp], 8 C M01
233 xor x1 = u1, v1 C M I
235 ld8 u1 = [up], 8 C M01
236 ld8 v1 = [vp], 8 C M01
238 xor x2 = u2, v2 C M I
240 ld8 u2 = [up], 8 C M01
241 ld8 v2 = [vp], 8 C M01
243 xor x3 = u3, v3 C M I
C .Lb11: feed-in path for n & 3 == 3 (entered when p8 is set).
C The second and third limb pairs are loaded into u2/v2 and u3/v3;
C x1 = xor of the first pair held in r10/r11, x2 = xor of the second pair.
247 .Lb11: ld8 u2 = [up], 8 C M01
248 ld8 v2 = [vp], 8 C M01
250 xor x1 = r10, r11 C M I
252 ld8 u3 = [up], 8 C M01
253 ld8 v3 = [vp], 8 C M01
254 xor x2 = u2, v2 C M I
C p15 is (4 < n): more limbs remain, continue at .grt3.  Otherwise xor the
C third pair and return through b0.
C NOTE(review): no popcnt or return-value write is visible on the short
C path; those lines appear to be missing from this view.
255 (p15) br.cond.dptk .grt3 C B
257 xor x3 = u3, v3 C M I
268 br.ret.sptk.many b0 C B
C .grt3: load the next limb pairs; u3/v3 are xored into x3 before being
C reloaded.
270 .grt3: ld8 u0 = [up], 8 C M01
271 ld8 v0 = [vp], 8 C M01
274 ld8 u1 = [up], 8 C M01
275 ld8 v1 = [vp], 8 C M01
278 ld8 u2 = [up], 8 C M01
279 ld8 v2 = [vp], 8 C M01
280 xor x3 = u3, v3 C M I
282 ld8 u3 = [up], 8 C M01
283 ld8 v3 = [vp], 8 C M01
285 xor x0 = u0, v0 C M I
C Counted-loop branch on ar.lc; when not taken, the xors below finish limb
C pairs that are already loaded.
286 br.cloop.dptk .grt7 C B
288 xor x1 = u1, v1 C M I
291 xor x2 = u2, v2 C M I
C .grt7: next round of loads and xors.
C NOTE(review): no branch from here into the main loop is visible; in this
C view the code falls through into whatever follows.
294 .grt7: ld8 u0 = [up], 8 C M01
295 ld8 v0 = [vp], 8 C M01
297 xor x1 = u1, v1 C M I
299 ld8 u1 = [up], 8 C M01
300 ld8 v1 = [vp], 8 C M01
302 xor x2 = u2, v2 C M I
C .Loop: main loop, four limb pairs per iteration.  Each step reloads one
C u/v register pair from up and vp (post-increment 8) and xors another pair
C that was loaded earlier: x1, x2 and x3 consume pairs from the previous
C round, x0 consumes the pair loaded at the top of this round.
C .LL00, .LL11 and .LL10 are mid-loop labels; no branch to them is visible
C in this view.
C NOTE(review): no popcnt or accumulation is visible inside the loop; those
C lines appear to be missing from this view.
307 .Loop: ld8 u0 = [up], 8 C M01
308 ld8 v0 = [vp], 8 C M01
311 xor x1 = u1, v1 C M I
314 .LL00: ld8 u1 = [up], 8 C M01
315 ld8 v1 = [vp], 8 C M01
318 xor x2 = u2, v2 C M I
321 .LL11: ld8 u2 = [up], 8 C M01
322 ld8 v2 = [vp], 8 C M01
325 xor x3 = u3, v3 C M I
328 .LL10: ld8 u3 = [up], 8 C M01
329 ld8 v3 = [vp], 8 C M01
332 xor x0 = u0, v0 C M I
C Counted-loop branch: repeats .Loop under control of ar.lc.
333 br.cloop.dptk .Loop C B
C .Lend: wind-down after the loop.  Population-count the xor words that are
C still pending (popcnt cN = xN) while finishing the xors of the limb pairs
C loaded last.  .Lcj8 down to .Lcj4 are entry labels into this tail; no
C branch to them is visible in this view.
C NOTE(review): no instruction that sums c0-c3 or writes a return value is
C visible here; those lines appear to be missing from this view.
336 .Lend: popcnt c2 = x2 C I0
338 xor x1 = u1, v1 C M I
340 .Lcj8: popcnt c3 = x3 C I0
342 xor x2 = u2, v2 C M I
344 .Lcj7: popcnt c0 = x0 C I0
346 xor x3 = u3, v3 C M I
348 .Lcj6: popcnt c1 = x1 C I0
351 .Lcj5: popcnt c2 = x2 C I0
354 .Lcj4: popcnt c3 = x3 C I0
C Put back the ar.lc value that was saved in r2 at entry, then return
C through b0.
362 mov.i ar.lc = r2 C I0
363 br.ret.sptk.many b0 C B