1 dnl ARM Neon mpn_popcount -- mpn bit population count.
3 dnl Copyright 2013 Free Software Foundation, Inc.
5 dnl This file is part of the GNU MP Library.
7 dnl The GNU MP Library is free software; you can redistribute it and/or modify
8 dnl it under the terms of either:
10 dnl * the GNU Lesser General Public License as published by the Free
11 dnl Software Foundation; either version 3 of the License, or (at your
12 dnl option) any later version.
14 dnl or
16 dnl * the GNU General Public License as published by the Free Software
17 dnl Foundation; either version 2 of the License, or (at your option) any
18 dnl later version.
20 dnl or both in parallel, as here.
22 dnl The GNU MP Library is distributed in the hope that it will be useful, but
23 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
25 dnl for more details.
27 dnl You should have received copies of the GNU General Public License and the
28 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
29 dnl see https://www.gnu.org/licenses/.
dnl Build configuration macros; presumably the source of PROLOGUE and L()
dnl used further down (not verifiable from this file alone).
31 include(`../config.m4')
42 C * Explore using vldr and vldm. Does it help on A9? (These loads do
43 C 64-bits-at-a-time, which will mess up in big-endian mode. Except not for
44 C popcount. Except perhaps also for popcount for the edge loads.)
45 C * Arrange to align the pointer, if that helps performance. Use the same
46 C read-and-mask trick we use on PCs, for simplicity and performance. (Sorry
47 C valgrind!)
48 C * Explore if explicit align directives, e.g., "[ptr:128]" help.
49 C * See rth's gmp-devel 2013-02/03 messages about final summation tricks.
55 C We sum into 16 16-bit counters in q8,q9, but at the end we sum them and end
56 C up with 8 16-bit counters. Therefore, we can sum to 8*(2^16-1) bits, or
57 C 8*(2^16-1)/32 = 0x3fff limbs (rounded down). We use a chunksize close to
58 C that, but which can be represented as an 8-bit ARM constant.
60 define(`chunksize',0x3f80) C 0x3f80 = 0xfe << 6, an 8-bit value shifted left
63 PROLOGUE(mpn_popcount)
C NOTE(review): only part of this routine is visible in this listing. What is
C shown: q8/q9 are zeroed as accumulators, limbs are fetched 1, 2, 4 or 8 at a
C time with post-incrementing vld1.32, vcnt.8 produces per-byte bit counts,
C vpadal.u8 folds those into 16-bit lanes, and vpaddl widens the totals.
69 vmov.i64 q8, #0 C clear summation register
70 vmov.i64 q9, #0 C clear summation register
76 vld1.32 {d0[0]}, [ap]! C load 1 limb
78 vpadal.u8 d16, d24 C d16/q8 = 0; could just splat
83 vld1.32 {d0}, [ap]! C load 2 limbs
90 vld1.32 {q0}, [ap]! C load 4 limbs
98 vld1.32 {q0,q1}, [ap]! C load 8 limbs
101 L(gt8): vld1.32 {q2,q3}, [ap]! C load 8 limbs
107 L(0000):subs n, n, #16 C count -= 16, setting flags (two 8-limb loads follow)
110 vld1.32 {q2,q3}, [ap]! C load 8 limbs
111 vld1.32 {q0,q1}, [ap]! C load 8 limbs
117 L(top): vld1.32 {q2,q3}, [ap]! C load 8 limbs
122 L(mid): vld1.32 {q0,q1}, [ap]! C load 8 limbs
130 L(end): vpadal.u8 q8, q12 C add adjacent byte counts of q12 into q8's 16-bit lanes
132 L(sum): vcnt.8 q12, q0 C per-byte bit counts of q0
137 C we have 8 16-bit counts
138 L(e0): vpaddl.u16 q8, q8 C we have 4 32-bit counts
139 vpaddl.u32 q8, q8 C we have 2 64-bit counts
145 C Code for large count. Splits operand and calls above code.
146 define(`ap2', r2) C caller-saves reg not used above
150 mov r3, n C full count
151 mov r4, #0 C total sum
153 1: mov n, #chunksize C count for this invocation
154 bl L(lt16k) C could jump deep inside code
155 add ap2, ap2, #chunksize*4 C point at next chunk
157 mov ap, ap2 C put chunk pointer in place for call
158 sub r3, r3, #chunksize C limbs left after this chunk
162 mov n, r3 C count for final invocation