new beta-0.90.0
[luatex.git] / source / libs / gmp / gmp-src / mpn / ia64 / hamdist.asm
blob477df4cd7183bc362f9fb1c140f3014a34867e49
1 dnl IA-64 mpn_hamdist -- mpn hamming distance.
3 dnl Contributed to the GNU project by Torbjorn Granlund.
5 dnl Copyright 2003-2005 Free Software Foundation, Inc.
7 dnl This file is part of the GNU MP Library.
8 dnl
9 dnl The GNU MP Library is free software; you can redistribute it and/or modify
10 dnl it under the terms of either:
11 dnl
12 dnl * the GNU Lesser General Public License as published by the Free
13 dnl Software Foundation; either version 3 of the License, or (at your
14 dnl option) any later version.
15 dnl
16 dnl or
17 dnl
18 dnl * the GNU General Public License as published by the Free Software
19 dnl Foundation; either version 2 of the License, or (at your option) any
20 dnl later version.
21 dnl
22 dnl or both in parallel, as here.
23 dnl
24 dnl The GNU MP Library is distributed in the hope that it will be useful, but
25 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
26 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
27 dnl for more details.
28 dnl
29 dnl You should have received copies of the GNU General Public License and the
30 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
31 dnl see https://www.gnu.org/licenses/.
33 include(`../config.m4')
35 C cycles/limb
36 C Itanium: 2
37 C Itanium 2: 1
39 C INPUT PARAMETERS
C The three incoming arguments are read from r32, r33 and r34:
C   up  pointer to the first limb vector
C   vp  pointer to the second limb vector
C   n   limb count
40 define(`up', `r32')
41 define(`vp', `r33')
42 define(`n', `r34')
C Working registers, four of each so that four limb pairs can be in flight:
C   u0-u3  limbs loaded through up
C   v0-v3  limbs loaded through vp
C   x0-x3  xor of the matching u and v limb
C   c0-c3  popcnt of the matching x value
44 define(`u0',`r16') define(`u1',`r17') define(`u2',`r18') define(`u3',`r19')
45 define(`v0',`r20') define(`v1',`r21') define(`v2',`r22') define(`v3',`r23')
46 define(`x0',`r24') define(`x1',`r25') define(`x2',`r26') define(`x3',`r27')
47 define(`c0',`r28') define(`c1',`r29') define(`c2',`r30') define(`c3',`r31')
C s is the running bit-count sum; it lives in r8, which holds the value the
C routine hands back when it executes br.ret.
48 define(`s',`r8')
C mpn_hamdist: adds up popcnt(up[i] xor vp[i]) over the n limb pairs and
C returns the total in s (r8).
C
C Layout: the entry code loads the first limb pair into r10/r11, saves ar.lc
C in r2, and dispatches on n mod 4 (r14) to .Lb00/.Lb01/.Lb10/.Lb11.  Each of
C those either finishes a short operand directly (p15 clear, i.e. n <= 4) or
C primes the four-stage loop at .Loop and joins it at the matching stage.
C The drain code at .Lend/.Lcj8-.Lcj4 completes the limbs still in flight,
C restores ar.lc and returns.
C
C NOTE(review): the embedded listing numbers skip values (e.g. 57 to 61 and
C 74 to 78) and this copy shows no stop (;;) lines, no closing braces for the
C explicit bundles and no closing quote for the ifdef body.  It looks like an
C extraction that dropped those lines -- confirm against the upstream file
C before assembling.
C
C NOTE(review): with n = 0 none of p6/p7/p8 is set, so control falls into
C .Lb00 and four limb pairs are loaded; presumably callers guarantee n >= 1 --
C TODO confirm.
51 ASM_START()
52 PROLOGUE(mpn_hamdist)
53 .prologue
C 32-bit ABI only: widen both pointers with addp4 and zero-extend n with zxt4.
54 ifdef(`HAVE_ABI_32',
55 ` addp4 up = 0, up C M I
56 addp4 vp = 0, vp C M I
57 zxt4 n = n C I
61 {.mmi; ld8 r10 = [up], 8 C load first ulimb M01
62 ld8 r11 = [vp], 8 C load first vlimb M01
63 mov.i r2 = ar.lc C save ar.lc I0
64 }{.mmi; and r14 = 3, n C M I
65 cmp.lt p15, p0 = 4, n C small count? M I
66 add n = -5, n C M I
68 }{.mmi; cmp.eq p6, p0 = 1, r14 C M I
69 cmp.eq p7, p0 = 2, r14 C M I
70 cmp.eq p8, p0 = 3, r14 C M I
71 }{.bbb
72 (p6) br.dptk .Lb01 C B
73 (p7) br.dptk .Lb10 C B
74 (p8) br.dptk .Lb11 C B
C n mod 4 = 0: reached by fall-through when none of p6/p7/p8 is set.
C Loads limbs 1-3, sets ar.lc = (n - 5) >> 2 and clears s.
78 .Lb00: ld8 u1 = [up], 8 C M01
79 ld8 v1 = [vp], 8 C M01
80 shr.u n = n, 2 C I0
81 xor x0 = r10, r11 C M I
83 ld8 u2 = [up], 8 C M01
84 ld8 v2 = [vp], 8 C M01
85 mov.i ar.lc = n C I0
86 xor x1 = u1, v1 C M I
88 ld8 u3 = [up], 8 C M01
89 ld8 v3 = [vp], 8 C M01
90 xor x2 = u2, v2 C M I
91 mov s = 0 C M I
92 (p15) br.cond.dptk .grt4 C B
C p15 clear: no loop pass is needed.  ar.lc was already overwritten above, so
C leave through .Lcj4, which finishes the sums and restores ar.lc.
94 popcnt c0 = x0 C I0
95 xor x3 = u3, v3 C M I
97 popcnt c1 = x1 C I0
99 popcnt c2 = x2 C I0
100 br .Lcj4 C B
C More than four limbs: keep loading.  br.cloop selects between another
C priming step (.grt8) and the drain code (.Lcj8).
102 .grt4: ld8 u0 = [up], 8 C M01
103 ld8 v0 = [vp], 8 C M01
104 xor x1 = u1, v1 C M I
106 ld8 u1 = [up], 8 C M01
107 ld8 v1 = [vp], 8 C M01
108 xor x2 = u2, v2 C M I
110 ld8 u2 = [up], 8 C M01
111 ld8 v2 = [vp], 8 C M01
112 popcnt c0 = x0 C I0
113 xor x3 = u3, v3 C M I
115 ld8 u3 = [up], 8 C M01
116 ld8 v3 = [vp], 8 C M01
117 popcnt c1 = x1 C I0
118 xor x0 = u0, v0 C M I
119 br.cloop.dpnt .grt8 C B
121 popcnt c2 = x2 C I0
122 xor x1 = u1, v1 C M I
123 br .Lcj8 C B
C Performs the work of the first loop stage, then joins the loop at .LL00.
125 .grt8: ld8 u0 = [up], 8 C M01
126 ld8 v0 = [vp], 8 C M01
127 popcnt c2 = x2 C I0
128 xor x1 = u1, v1 C M I
129 br .LL00 C B
C n mod 4 = 1: the first limb pair becomes x3.
132 .Lb01: xor x3 = r10, r11 C M I
133 shr.u n = n, 2 C I0
134 (p15) br.cond.dptk .grt1 C B
C p15 clear: a single limb pair.  ar.lc has not been written on this path, so
C the result goes straight to r8 and the routine returns without restoring it.
136 popcnt r8 = x3 C I0
137 br.ret.sptk.many b0 C B
C More limbs follow: set ar.lc = (n - 5) >> 2, clear s and load four more
C limb pairs.
139 .grt1: ld8 u0 = [up], 8 C M01
140 ld8 v0 = [vp], 8 C M01
141 mov.i ar.lc = n C I0
143 ld8 u1 = [up], 8 C M01
144 ld8 v1 = [vp], 8 C M01
145 mov s = 0 C M I
147 ld8 u2 = [up], 8 C M01
148 ld8 v2 = [vp], 8 C M01
150 ld8 u3 = [up], 8 C M01
151 ld8 v3 = [vp], 8 C M01
152 xor x0 = u0, v0 C M I
153 br.cloop.dpnt .grt5 C B
C Loop counter exhausted: finish the loaded limbs and drain from .Lcj5.
155 xor x1 = u1, v1 C M I
157 popcnt c3 = x3 C I0
158 xor x2 = u2, v2 C M I
160 popcnt c0 = x0 C I0
161 xor x3 = u3, v3 C M I
163 popcnt c1 = x1 C I0
164 br .Lcj5 C B
C Loads another four limb pairs, then either enters the loop at its top or
C goes to .Lend.
166 .grt5: ld8 u0 = [up], 8 C M01
167 ld8 v0 = [vp], 8 C M01
168 xor x1 = u1, v1 C M I
170 ld8 u1 = [up], 8 C M01
171 ld8 v1 = [vp], 8 C M01
172 popcnt c3 = x3 C I0
173 xor x2 = u2, v2 C M I
175 ld8 u2 = [up], 8 C M01
176 ld8 v2 = [vp], 8 C M01
177 popcnt c0 = x0 C I0
178 xor x3 = u3, v3 C M I
180 ld8 u3 = [up], 8 C M01
181 ld8 v3 = [vp], 8 C M01
182 popcnt c1 = x1 C I0
183 xor x0 = u0, v0 C M I
184 br.cloop.dpnt .Loop C B
185 br .Lend C B
C n mod 4 = 2: the first limb pair becomes x2, the second is loaded into u3/v3.
188 .Lb10: ld8 u3 = [up], 8 C M01
189 ld8 v3 = [vp], 8 C M01
190 xor x2 = r10, r11 C M I
191 (p15) br.cond.dptk .grt2 C B
C p15 clear: two limb pairs.  ar.lc has not been written on this path, so the
C routine returns c2 + c3 directly.
193 xor x3 = u3, v3 C M I
195 popcnt c2 = x2 C I0
197 popcnt c3 = x3 C I0
199 add s = c2, c3 C M I
200 br.ret.sptk.many b0 C B
C More limbs follow: set ar.lc = (n - 5) >> 2, clear s and load four more
C limb pairs.
202 .grt2: ld8 u0 = [up], 8 C M01
203 ld8 v0 = [vp], 8 C M01
204 shr.u n = n, 2 C I0
206 ld8 u1 = [up], 8 C M01
207 ld8 v1 = [vp], 8 C M01
208 mov.i ar.lc = n C I0
209 mov s = 0 C M I
211 ld8 u2 = [up], 8 C M01
212 ld8 v2 = [vp], 8 C M01
213 xor x3 = u3, v3 C M I
215 ld8 u3 = [up], 8 C M01
216 ld8 v3 = [vp], 8 C M01
217 xor x0 = u0, v0 C M I
218 br.cloop.dptk .grt6 C B
C Loop counter exhausted: finish the loaded limbs and drain from .Lcj6.
220 popcnt c2 = x2 C I0
221 xor x1 = u1, v1 C M I
223 popcnt c3 = x3 C I0
224 xor x2 = u2, v2 C M I
226 popcnt c0 = x0 C I0
227 xor x3 = u3, v3 C M I
228 br .Lcj6 C B
C Loads three more limb pairs, then joins the loop at .LL10.
230 .grt6: ld8 u0 = [up], 8 C M01
231 ld8 v0 = [vp], 8 C M01
232 popcnt c2 = x2 C I0
233 xor x1 = u1, v1 C M I
235 ld8 u1 = [up], 8 C M01
236 ld8 v1 = [vp], 8 C M01
237 popcnt c3 = x3 C I0
238 xor x2 = u2, v2 C M I
240 ld8 u2 = [up], 8 C M01
241 ld8 v2 = [vp], 8 C M01
242 popcnt c0 = x0 C I0
243 xor x3 = u3, v3 C M I
244 br .LL10 C B
C n mod 4 = 3: the first limb pair becomes x1, the next two are loaded into
C u2/v2 and u3/v3.
247 .Lb11: ld8 u2 = [up], 8 C M01
248 ld8 v2 = [vp], 8 C M01
249 shr.u n = n, 2 C I0
250 xor x1 = r10, r11 C M I
252 ld8 u3 = [up], 8 C M01
253 ld8 v3 = [vp], 8 C M01
254 xor x2 = u2, v2 C M I
255 (p15) br.cond.dptk .grt3 C B
C p15 clear: three limb pairs.  ar.lc has not been written on this path, so
C the routine returns c1 + c2 + c3 directly.
257 xor x3 = u3, v3 C M I
259 popcnt c1 = x1 C I0
261 popcnt c2 = x2 C I0
263 popcnt c3 = x3 C I0
265 add s = c1, c2 C M I
267 add s = s, c3 C M I
268 br.ret.sptk.many b0 C B
C More limbs follow: set ar.lc from the shifted n, clear s and load four more
C limb pairs.
270 .grt3: ld8 u0 = [up], 8 C M01
271 ld8 v0 = [vp], 8 C M01
272 mov.i ar.lc = n C I0
274 ld8 u1 = [up], 8 C M01
275 ld8 v1 = [vp], 8 C M01
276 mov s = 0 C M I
278 ld8 u2 = [up], 8 C M01
279 ld8 v2 = [vp], 8 C M01
280 xor x3 = u3, v3 C M I
282 ld8 u3 = [up], 8 C M01
283 ld8 v3 = [vp], 8 C M01
284 popcnt c1 = x1 C I0
285 xor x0 = u0, v0 C M I
286 br.cloop.dptk .grt7 C B
C Loop counter exhausted: finish the loaded limbs and drain from .Lcj7.
287 popcnt c2 = x2 C I0
288 xor x1 = u1, v1 C M I
290 popcnt c3 = x3 C I0
291 xor x2 = u2, v2 C M I
292 br .Lcj7 C B
C Loads two more limb pairs, then joins the loop at .LL11.
294 .grt7: ld8 u0 = [up], 8 C M01
295 ld8 v0 = [vp], 8 C M01
296 popcnt c2 = x2 C I0
297 xor x1 = u1, v1 C M I
299 ld8 u1 = [up], 8 C M01
300 ld8 v1 = [vp], 8 C M01
301 popcnt c3 = x3 C I0
302 xor x2 = u2, v2 C M I
303 br .LL11 C B
C Main loop: four stages per pass, one limb pair per stage.  Every stage loads
C a new u/v pair, takes the popcnt of an xor computed earlier, adds an earlier
C popcnt into s, and xors a pair loaded earlier.  The entry paths above join
C at .Loop, .LL00, .LL11 or .LL10; br.cloop repeats the pass while ar.lc is
C nonzero.
306 ALIGN(32)
307 .Loop: ld8 u0 = [up], 8 C M01
308 ld8 v0 = [vp], 8 C M01
309 popcnt c2 = x2 C I0
310 add s = s, c3 C M I
311 xor x1 = u1, v1 C M I
312 nop.b 1 C -
314 .LL00: ld8 u1 = [up], 8 C M01
315 ld8 v1 = [vp], 8 C M01
316 popcnt c3 = x3 C I0
317 add s = s, c0 C M I
318 xor x2 = u2, v2 C M I
319 nop.b 1 C -
321 .LL11: ld8 u2 = [up], 8 C M01
322 ld8 v2 = [vp], 8 C M01
323 popcnt c0 = x0 C I0
324 add s = s, c1 C M I
325 xor x3 = u3, v3 C M I
326 nop.b 1 C -
328 .LL10: ld8 u3 = [up], 8 C M01
329 ld8 v3 = [vp], 8 C M01
330 popcnt c1 = x1 C I0
331 add s = s, c2 C M I
332 xor x0 = u0, v0 C M I
333 br.cloop.dptk .Loop C B
C Drain: the same popcnt/add/xor sequence as the loop but with no further
C loads.  .Lend is reached from the loop and from .grt5; .Lcj8, .Lcj7, .Lcj6,
C .Lcj5 and .Lcj4 are the entry points used by the .grt4, .grt3, .grt2, .grt1
C and .Lb00 paths respectively, each skipping the steps it has already done.
336 .Lend: popcnt c2 = x2 C I0
337 add s = s, c3 C M I
338 xor x1 = u1, v1 C M I
340 .Lcj8: popcnt c3 = x3 C I0
341 add s = s, c0 C M I
342 xor x2 = u2, v2 C M I
344 .Lcj7: popcnt c0 = x0 C I0
345 add s = s, c1 C M I
346 xor x3 = u3, v3 C M I
348 .Lcj6: popcnt c1 = x1 C I0
349 add s = s, c2 C M I
351 .Lcj5: popcnt c2 = x2 C I0
352 add s = s, c3 C M I
354 .Lcj4: popcnt c3 = x3 C I0
355 add s = s, c0 C M I
357 add s = s, c1 C M I
359 add s = s, c2 C M I
361 add s = s, c3 C M I
C Put back the ar.lc value saved in r2 at entry, then return with the sum in s.
362 mov.i ar.lc = r2 C I0
363 br.ret.sptk.many b0 C B
364 EPILOGUE()
365 ASM_END()