1 dnl ARM Neon mpn_hamdist -- mpn bit hamming distance.
3 dnl Copyright
2013 Free Software Foundation
, Inc.
5 dnl
This file is part of the GNU MP Library.
7 dnl The GNU MP Library is free software
; you can redistribute it and/or modify
8 dnl it under the terms of
either:
10 dnl
* the GNU Lesser General
Public License as published by the Free
11 dnl Software Foundation
; either version 3 of the License, or (at your
12 dnl option
) any later version.
16 dnl
* the GNU General
Public License as published by the Free Software
17 dnl Foundation
; either version 2 of the License, or (at your option) any
20 dnl
or both
in parallel
, as here.
22 dnl The GNU MP Library is distributed
in the hope that it will be useful
, but
23 dnl WITHOUT ANY WARRANTY
; without even the implied warranty of MERCHANTABILITY
24 dnl
or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
Public License
27 dnl You should have received copies of the GNU General
Public License
and the
28 dnl GNU Lesser General
Public License along with the GNU MP Library. If
not,
29 dnl see https://www.gnu.org/licenses/.
31 include(`..
/config.m4
')
42 C * Explore using vldr and vldm. Does it help on A9? (These loads do
43 C 64-bits-at-a-time, which will mess up in big-endian mode. Except not for
44 C popcount. Except perhaps also for popcount for the edge loads.)
45 C * Arrange to align the pointer, if that helps performance. Use the same
46 C read-and-mask trick we use on PCs, for simplicity and performance. (Sorry
48 C * Explore if explicit align directives, e.g., "[ptr:128]" help.
49 C * See rth's gmp-devel 2013-02/03 messages about final summation tricks.
56 C We sum into 16 16-bit counters in q8,q9, but at the end we sum them and end
57 C up with 8 16-bit counters. Therefore, we can sum to 8(2^16-1) bits, or
58 C floor(8(2^16-1)/32) = 0x3fff limbs. We use a chunksize close to that, but which
59 C can be represented as an 8-bit ARM constant.
61 define(`chunksize',0x3f80)
70 vmov.i64 q8
, #
0 C clear summation register
71 vmov.i64 q9
, #
0 C clear summation register
78 vld1.32
{d0[0]}, [ap
]! C load
1 limb
79 vld1.32
{d20[0]}, [bp]! C load
1 limb
82 vpadal.u8 d16
, d24 C d16
/q8
= 0; could just splat
87 vld1.32
{d0}, [ap
]! C load
2 limbs
88 vld1.32
{d20}, [bp]! C load
2 limbs
96 vld1.32
{q0}, [ap
]! C load
4 limbs
97 vld1.32
{q10}, [bp]! C load
4 limbs
106 vld1.32
{q0,q1}, [ap
]! C load
8 limbs
107 vld1.32
{q10,q11}, [bp]! C load
8 limbs
110 L
(gt8
): vld1.32
{q2,q3}, [ap
]! C load
8 limbs
111 vld1.32
{q14,q15}, [bp]! C load
8 limbs
119 L
(0000):subs n
, n
, #
16
122 vld1.32
{q2,q3}, [ap
]! C load
8 limbs
123 vld1.32
{q0,q1}, [ap
]! C load
8 limbs
124 vld1.32
{q14,q15}, [bp]! C load
8 limbs
125 vld1.32
{q10,q11}, [bp]! C load
8 limbs
133 L
(top
): vld1.32
{q2,q3}, [ap
]! C load
8 limbs
134 vld1.32
{q14,q15}, [bp]! C load
8 limbs
141 L
(mid
): vld1.32
{q0,q1}, [ap
]! C load
8 limbs
142 vld1.32
{q10,q11}, [bp]! C load
8 limbs
152 L
(end): vpadal.u8 q8
, q12
154 L
(sum
): veor q0
, q0
, q10
161 C we have
8 16-bit counts
162 L
(e0
): vpaddl.u16 q8
, q8 C we have
4 32-bit counts
163 vpaddl.u32 q8
, q8 C we have
2 64-bit counts
169 C Code for large count. Splits operand and calls above code.
176 mov r3
, n C full count
177 mov r4
, #
0 C total sum
179 1: mov n
, #chunksize C count for
this invocation
180 bl L
(lt16k
) C could jump deep inside code
181 add ap2
, ap2
, #chunksize
*4 C point at next chunk
182 add bp2
, bp2
, #chunksize
*4 C point at next chunk
184 mov ap
, ap2 C put chunk pointer
in place for
call
185 mov bp, bp2 C put chunk pointer
in place for
call
186 sub r3
, r3
, #chunksize
190 mov n
, r3 C count for final invocation