1 dnl ARM64 Neon mpn_hamdist -- mpn bit hamming distance.
3 dnl Copyright 2013, 2014 Free Software Foundation, Inc.
5 dnl This file is part of the GNU MP Library.
7 dnl The GNU MP Library is free software; you can redistribute it and/or modify
8 dnl it under the terms of either:
10 dnl * the GNU Lesser General Public License as published by the Free
11 dnl Software Foundation; either version 3 of the License, or (at your
12 dnl option) any later version.
16 dnl * the GNU General Public License as published by the Free Software
17 dnl Foundation; either version 2 of the License, or (at your option) any later version.
20 dnl or both in parallel, as here.
22 dnl The GNU MP Library is distributed in the hope that it will be useful, but
23 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
27 dnl You should have received copies of the GNU General Public License and the
28 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
29 dnl see https://www.gnu.org/licenses/.
C Pull in the shared m4 configuration.  The quoted path must sit on a single
C line: a newline inside the quotes becomes part of the file name.
31 include(`../config.m4')
38 C * Consider greater unrolling.
39 C * Arrange to align the pointer, if that helps performance. Use the same
40 C read-and-mask trick we use on PCs, for simplicity and performance. (Sorry
42 C * Explore if explicit align directives, e.g., "[ptr:128]" help.
43 C * See rth's gmp-devel 2013-02/03 messages about final summation tricks.
52 C We sum into 16 16-bit counters in v4,v5, but at the end we sum them and end
53 C up with 8 16-bit counters. Therefore, we can sum to 8*(2^16-1) bits, or
54 C floor(8*(2^16-1)/64) = 0x1fff limbs. We use a chunksize close to that, but which
55 C allows the huge count code to jump deep into the code (at L(chu)).
C maxsize is the limb-count limit derived above; chunksize is the number of
C limbs the huge count code hands to L(chu) per call.  Each define must have
C its "(" directly after the macro name and its quoted name on one line.
57 define(`maxsize', 0x1fff)
58 define(`chunksize',0x1ff0)
C Clear both accumulators, then process one limb from each operand: load it
C (post-incrementing ap and bp by 8 bytes) and XOR the pair, so that set bits
C in v0 mark the bit positions where the operands differ.
C NOTE(review): any test of n that makes this single-limb step conditional
C is not in the visible lines.
68 movi v4.16b, #0 C clear summation register
69 movi v5.16b, #0 C clear summation register
73 ld1 {v0.1d}, [ap], #8 C load 1 limb
74 ld1 {v16.1d}, [bp], #8 C load 1 limb
75 eor v0.16b, v0.16b, v16.16b
C NOTE(review): v6 is consumed below but never written in the visible lines;
C presumably a per-byte popcount of v0 sits on an elided line -- confirm.
C uadalp adds adjacent byte pairs of v6 into the 16-bit lanes of v4.
77 uadalp v4.8h, v6.16b C could also splat
C L(xx0): if bit 1 of n is clear, skip to L(x00); otherwise load two limbs
C from each operand (advancing ap and bp by 16 bytes) and XOR them into v0.
79 L(xx0): tbz n, #1, L(x00)
81 ld1 {v0.2d}, [ap], #16 C load 2 limbs
82 ld1 {v16.2d}, [bp], #16 C load 2 limbs
83 eor v0.16b, v0.16b, v16.16b
C L(x00): if bit 2 of n is clear, skip to L(000); otherwise load four limbs
C from each operand into v0,v1 and v16,v17 (advancing ap and bp by 32 bytes).
87 L(x00): tbz n, #2, L(000)
89 ld1 {v0.2d,v1.2d}, [ap], #32 C load 4 limbs
90 ld1 {v16.2d,v17.2d}, [bp], #32 C load 4 limbs
C L(gt4): load the next four limbs into v2,v3 and v18,v19, then XOR the group
C already held in v0,v1 with v16,v17.
C NOTE(review): whatever separates the loads above from L(gt4), and any
C branch that targets L(gt4), is not in the visible lines.
93 L(gt4): ld1 {v2.2d,v3.2d}, [ap], #32 C load 4 limbs
94 ld1 {v18.2d,v19.2d}, [bp], #32 C load 4 limbs
95 eor v0.16b, v0.16b, v16.16b
96 eor v1.16b, v1.16b, v17.16b
C L(000): subtract 8 from n and set the flags; the instruction that consumes
C those flags is not in the visible lines.
102 L(000): subs n, n, #8
C L(chu): entry point that the count > maxsize code reaches with bl.  Loads
C eight limbs from each operand (v2,v3 then v0,v1 from ap; v18,v19 then
C v16,v17 from bp) and XORs the first four-limb group.
105 L(chu): ld1 {v2.2d,v3.2d}, [ap], #32 C load 4 limbs
106 ld1 {v0.2d,v1.2d}, [ap], #32 C load 4 limbs
107 ld1 {v18.2d,v19.2d}, [bp], #32 C load 4 limbs
108 ld1 {v16.2d,v17.2d}, [bp], #32 C load 4 limbs
109 eor v2.16b, v2.16b, v18.16b
110 eor v3.16b, v3.16b, v19.16b
C L(top) and L(mid) alternate register groups: each loads the next four limbs
C from ap and bp into one group while XORing the group loaded earlier
C (v0,v1 ^ v16,v17 at L(top); v2,v3 ^ v18,v19 at L(mid)).
C NOTE(review): the instructions that count and accumulate the XOR results,
C and any loop control, are not in the visible lines.
116 L(top): ld1 {v2.2d,v3.2d}, [ap], #32 C load 4 limbs
117 ld1 {v18.2d,v19.2d}, [bp], #32 C load 4 limbs
118 eor v0.16b, v0.16b, v16.16b
119 eor v1.16b, v1.16b, v17.16b
124 L(mid): ld1 {v0.2d,v1.2d}, [ap], #32 C load 4 limbs
125 ld1 {v16.2d,v17.2d}, [bp], #32 C load 4 limbs
126 eor v2.16b, v2.16b, v18.16b
127 eor v3.16b, v3.16b, v19.16b
C L(end): add adjacent byte pairs of v6 into the 16-bit lanes of v4.
135 L(end): uadalp v4.8h, v6.16b
C L(sum): XOR the four-limb group held in v0,v1 with v16,v17.
137 L(sum): eor v0.16b, v0.16b, v16.16b
138 eor v1.16b, v1.16b, v17.16b
C Merge the two accumulators lane by lane, then widen pairwise twice.
C NOTE(review): combining the two 64-bit counts into the return value happens
C outside the visible lines.
143 add v4.8h, v4.8h, v5.8h
144 C we have 8 16-bit counts
145 L(e0): uaddlp v4.4s, v4.8h C we have 4 32-bit counts
146 uaddlp v4.2d, v4.4s C we have 2 64-bit counts
152 C Code for count > maxsize. Splits operand and calls above code.
C ap2/bp2 hold the addresses of the next chunk while the current one is being
C processed.  Each define stays on one line so the expansion is exactly the
C register name, with no embedded newline.
153 define(`ap2', x5) C caller-saves reg not used above
154 define(`bp2', x6) C caller-saves reg not used above
C Set-up for operands longer than maxsize limbs: x7 keeps the full limb count,
C x4 starts the running total at zero, x9 is the size of one chunk in bytes
C (chunksize limbs of 8 bytes each) and x10 is the chunk size in limbs.
157 mov x7, n C full count (caller-saves reg not used above)
158 mov x4, #0 C total sum (caller-saves reg not used above)
159 mov x9, #chunksize*8 C caller-saves reg not used above
160 mov x10, #chunksize C caller-saves reg not used above
C Per-chunk step: record where the following chunk starts, set n for entry at
C L(chu), clear the accumulators and call into the main code.
C NOTE(review): bl overwrites x30; saving and restoring the caller's return
C address is not in the visible lines -- confirm it is preserved.
162 1: add ap2, ap, x9 C point at subsequent block
163 add bp2, bp, x9 C point at subsequent block
164 mov n, #chunksize-8 C count for this invocation, adjusted for entry pt
165 movi v4.16b, #0 C clear chunk summation register
166 movi v5.16b, #0 C clear chunk summation register
167 bl L(chu) C jump deep inside code
C Advance ap and bp to the chunk recorded before the call.
C NOTE(review): the updates of x4 and x7 and the branch back to 1: are not in
C the visible lines.
169 mov ap, ap2 C put chunk pointer in place for calls
170 mov bp, bp2 C put chunk pointer in place for calls
175 mov n, x7 C count for final invocation