dnl  ARM64 Neon mpn_popcount -- mpn bit population count.
3 dnl Copyright
2013, 2014 Free Software Foundation
, Inc.
5 dnl
This file is part of the GNU MP Library.
7 dnl The GNU MP Library is free software
; you can redistribute it and/or modify
8 dnl it under the terms of
either:
10 dnl
* the GNU Lesser General
Public License as published by the Free
11 dnl Software Foundation
; either version 3 of the License, or (at your
12 dnl option
) any later version.
16 dnl
* the GNU General
Public License as published by the Free Software
17 dnl Foundation
; either version 2 of the License, or (at your option) any
20 dnl
or both
in parallel
, as here.
22 dnl The GNU MP Library is distributed
in the hope that it will be useful
, but
23 dnl WITHOUT ANY WARRANTY
; without even the implied warranty of MERCHANTABILITY
24 dnl
or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
Public License
27 dnl You should have received copies of the GNU General
Public License
and the
28 dnl GNU Lesser General
Public License along with the GNU MP Library. If
not,
29 dnl see
https://www.gnu.
org/licenses
/.
31 include(`..
/config.m4
')
C  * Consider greater unrolling.
C  * Arrange to align the pointer, if that helps performance.  Use the same
C    read-and-mask trick we use on PCs, for simplicity and performance.  (Sorry
C    for the pun!)
C  * Explore if explicit align directives, e.g., "[ptr:128]", help.
C  * See rth's gmp-devel 2013-02/03 messages about final summation tricks.
C We sum into 16 16-bit counters in v4,v5, but at the end we sum them and end
C up with 8 16-bit counters.  Therefore, we can sum to 8*(2^16-1) bits, or
C floor(8*(2^16-1)/64) = 0x1fff limbs.  We use a chunksize close to that, but
C which allows the huge count code to jump deep into the code (at L(chu)).
C maxsize:   largest limb count handled in a single pass; 0x1fff limbs is the
C            most the 16-bit counters can accumulate without overflowing.
C chunksize: limb count of each piece when a larger operand is split up;
C            0x1ff0, i.e. slightly below maxsize.
define(`maxsize',  0x1fff)
define(`chunksize',0x1ff0)
60 PROLOGUE
(mpn_popcount
)
67 movi v4.16b
, #
0 C clear summation register
68 movi v5.16b
, #
0 C clear summation register
72 ld1
{v0.1d}, [ap
], #
8 C load
1 limb
74 uadalp v4.8h
, v6.16b C could also splat
76 L
(xx0
): tbz n
, #
1, L
(x00
)
78 ld1
{v0.2d}, [ap
], #
16 C load
2 limbs
82 L
(x00
): tbz n
, #
2, L
(000)
84 ld1
{v0.2d,v1.2d}, [ap
], #
32 C load
4 limbs
87 L
(gt4
): ld1
{v2.2d,v3.2d}, [ap
], #
32 C load
4 limbs
96 L
(chu
): ld1
{v2.2d,v3.2d}, [ap
], #
32 C load
4 limbs
97 ld1
{v0.2d,v1.2d}, [ap
], #
32 C load
4 limbs
103 L
(top
): ld1
{v2.2d,v3.2d}, [ap
], #
32 C load
4 limbs
108 L
(mid
): ld1
{v0.2d,v1.2d}, [ap
], #
32 C load
4 limbs
116 L
(end): uadalp v4.8h
, v6.16b
118 L
(sum
): cnt v6.16b
, v0.16b
122 add v4.8h
, v4.8h
, v5.8h
123 C we have
8 16-bit counts
124 L
(e0
): uaddlp v4.4s
, v4.8h C we have
4 32-bit counts
125 uaddlp v4.2d
, v4.4s C we have
2 64-bit counts
C Code for count > maxsize.  Splits the operand into chunks and calls the code
C above.
132 define
(`ap2
', x5) C caller-saves reg not used above
135 mov x7, n C full count (caller-saves reg not used above)
136 mov x4, #0 C total sum (caller-saves reg not used above)
137 mov x9, #chunksize*8 C caller-saves reg not used above
138 mov x10, #chunksize C caller-saves reg not used above
140 1: add ap2, ap, x9 C point at subsequent block
141 mov n, #chunksize-8 C count for this invocation, adjusted for entry pt
142 movi v4.16b, #0 C clear chunk summation register
143 movi v5.16b, #0 C clear chunk summation register
144 bl L(chu) C jump deep inside code
146 mov ap, ap2 C put chunk pointer in place for calls
151 mov n, x7 C count for final invocation