beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / arm / neon / popcount.asm
blob2f8f9afc8d6fdba04149f7aa2f55e554dad9adc8
1 dnl ARM Neon mpn_popcount -- mpn bit population count.
3 dnl Copyright 2013 Free Software Foundation, Inc.
5 dnl This file is part of the GNU MP Library.
6 dnl
7 dnl The GNU MP Library is free software; you can redistribute it and/or modify
8 dnl it under the terms of either:
9 dnl
10 dnl * the GNU Lesser General Public License as published by the Free
11 dnl Software Foundation; either version 3 of the License, or (at your
12 dnl option) any later version.
13 dnl
14 dnl or
15 dnl
16 dnl * the GNU General Public License as published by the Free Software
17 dnl Foundation; either version 2 of the License, or (at your option) any
18 dnl later version.
19 dnl
20 dnl or both in parallel, as here.
21 dnl
22 dnl The GNU MP Library is distributed in the hope that it will be useful, but
23 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
25 dnl for more details.
26 dnl
27 dnl You should have received copies of the GNU General Public License and the
28 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
29 dnl see https://www.gnu.org/licenses/.
31 include(`../config.m4')
33 C cycles/limb
34 C StrongARM: -
35 C XScale -
36 C Cortex-A7 ?
37 C Cortex-A8 ?
38 C Cortex-A9 1.125
39 C Cortex-A15 0.56
41 C TODO
42 C * Explore using vldr and vldm. Does it help on A9? (These loads do
43 C 64-bits-at-a-time, which will mess up in big-endian mode. Except not for
44 C popcount. Except perhaps also for popcount for the edge loads.)
45 C * Arrange to align the pointer, if that helps performance. Use the same
46 C read-and-mask trick we use on PCs, for simplicity and performance. (Sorry
47 C valgrind!)
48 C * Explore if explicit align directives, e.g., "[ptr:128]" help.
49 C * See rth's gmp-devel 2013-02/03 messages about final summation tricks.
51 C INPUT PARAMETERS
52 define(`ap', r0) C address of the limbs; advanced by the post-incrementing loads
53 define(`n', r1) C number of limbs still to be accounted for
55 C We sum into 16 16-bit counters in q8,q9, but at the end we sum them and end
56 C up with 8 16-bit counters. Therefore, we can sum to 8*(2^16-1) bits, or
57 C floor(8*(2^16-1)/32) = 0x3fff limbs. We use a chunksize close to that, but
58 C which can be represented as an 8-bit ARM constant (0x3f80 = 0xfe << 6).
C chunksize*4 = 0xfe00 is used as an immediate too, in the large-count code.
60 define(`chunksize',0x3f80)
C NOTE(review): ASM_START is not defined in this file; presumably it comes
C from the m4 support files pulled in through config.m4 -- confirm there.
62 ASM_START()
C mpn_popcount (ap, n) -- return in r0 the number of 1 bits in the n 32-bit
C limbs at ap.
C
C Register use:
C   r0       ap on entry, advanced by the loads; holds the result on return
C   r1       n, limbs not yet accounted for; scratch in the final summation
C   q0-q3    limb data
C   q12,q13  per-byte bit counts produced by vcnt.8
C   q8,q9    16-bit accumulators fed by vpadal.u8
C   r2,r3,r4 used only by the L(gt16k) code: chunk pointer, limbs left, total
C
C L(lt16k) handles at most chunksize limbs. It is both the fall-through path
C for small n and a subroutine (bl ... bx lr) for L(gt16k), which relies on
C it touching no core register other than r0 and r1.
63 PROLOGUE(mpn_popcount)
65 cmp n, #chunksize
66 bhi L(gt16k) C too many limbs for one pass of the 16-bit counters
68 L(lt16k):
69 vmov.i64 q8, #0 C clear summation register
70 vmov.i64 q9, #0 C clear summation register
C Peel off n mod 16 limbs first, testing one bit of n at a time (1, 2, 4 and
C then 8 limbs), so that what remains for the main loop is a multiple of 16.
72 tst n, #1
73 beq L(xxx0)
74 vmov.i64 d0, #0 C only lane 0 is loaded below; keep the other lane at 0
75 sub n, n, #1
76 vld1.32 {d0[0]}, [ap]! C load 1 limb
77 vcnt.8 d24, d0
78 vpadal.u8 d16, d24 C d16/q8 = 0; could just splat
80 L(xxx0):tst n, #2
81 beq L(xx00)
82 sub n, n, #2
83 vld1.32 {d0}, [ap]! C load 2 limbs
84 vcnt.8 d24, d0
85 vpadal.u8 d16, d24
87 L(xx00):tst n, #4
88 beq L(x000)
89 sub n, n, #4
90 vld1.32 {q0}, [ap]! C load 4 limbs
91 vcnt.8 q12, q0
92 vpadal.u8 q8, q12
94 L(x000):tst n, #8
95 beq L(0000)
C Bit 3 of n is set, so n >= 8 and the subs below cannot borrow; bls is then
C taken exactly when the result is zero. The 8 limbs loaded into q0,q1 are
C not counted here but at L(sum).
97 subs n, n, #8
98 vld1.32 {q0,q1}, [ap]! C load 8 limbs
99 bls L(sum) C these were the last limbs
101 L(gt8): vld1.32 {q2,q3}, [ap]! C load 8 limbs
102 sub n, n, #8 C account for the 8 limbs just loaded
103 vcnt.8 q12, q0
104 vcnt.8 q13, q1
105 b L(mid) C join the loop at its second half
C n is a multiple of 16 here, so a borrow means no limbs are left. q9 is
C still zero on that path, which is why L(e0) may skip the vadd.i16.
107 L(0000):subs n, n, #16
108 blo L(e0)
C Prime the loop: load 16 limbs and form the byte counts of the first 8.
110 vld1.32 {q2,q3}, [ap]! C load 8 limbs
111 vld1.32 {q0,q1}, [ap]! C load 8 limbs
112 vcnt.8 q12, q2
113 vcnt.8 q13, q3
114 subs n, n, #16
115 blo L(end) C no further group of 16 limbs: drain
C Main loop, 16 limbs per iteration. Each half loads 8 new limbs while the
C byte counts of earlier loads are accumulated (vpadal.u8) and the byte
C counts of the other register pair are formed (vcnt.8). On leaving the loop
C q12,q13 hold counts not yet accumulated and q0,q1 hold limbs not yet counted.
117 L(top): vld1.32 {q2,q3}, [ap]! C load 8 limbs
118 vpadal.u8 q8, q12
119 vcnt.8 q12, q0
120 vpadal.u8 q9, q13
121 vcnt.8 q13, q1
122 L(mid): vld1.32 {q0,q1}, [ap]! C load 8 limbs
123 subs n, n, #16
124 vpadal.u8 q8, q12
125 vcnt.8 q12, q2
126 vpadal.u8 q9, q13
127 vcnt.8 q13, q3
128 bhs L(top) C loop while the subs above did not borrow
130 L(end): vpadal.u8 q8, q12 C accumulate the pending byte counts
131 vpadal.u8 q9, q13
132 L(sum): vcnt.8 q12, q0 C count the last 8 limbs, still in q0,q1
133 vcnt.8 q13, q1
134 vpadal.u8 q8, q12
135 vpadal.u8 q9, q13
136 vadd.i16 q8, q8, q9 C fold the two accumulators into one
137 C we have 8 16-bit counts
138 L(e0): vpaddl.u16 q8, q8 C we have 4 32-bit counts
139 vpaddl.u32 q8, q8 C we have 2 64-bit counts
140 vmov.32 r0, d16[0] C low word of the first 64-bit count
141 vmov.32 r1, d17[0] C low word of the second 64-bit count
142 add r0, r0, r1
143 bx lr C return; also returns to the bl sites in L(gt16k)
145 C Code for large count. Splits operand and calls above code.
C r3 > chunksize on entry and before every subtraction in the loop, so the
C count left for the final invocation is in the range 1..chunksize.
146 define(`ap2', r2) C caller-saves reg not used above
147 L(gt16k):
148 push {r4,r14} C r4 holds the total; lr is overwritten by bl
149 mov ap2, ap
150 mov r3, n C full count
151 mov r4, #0 C total sum
153 1: mov n, #chunksize C count for this invocation
154 bl L(lt16k) C could jump deep inside code
155 add ap2, ap2, #chunksize*4 C point at next chunk
156 add r4, r4, r0 C add this chunk's count to the total
157 mov ap, ap2 C put chunk pointer in place for call
158 sub r3, r3, #chunksize
159 cmp r3, #chunksize
160 bhi 1b C more than one chunk still left
162 mov n, r3 C count for final invocation
163 bl L(lt16k)
164 add r0, r4, r0 C result = total so far + last chunk
165 pop {r4,pc} C restore r4 and return to the caller
166 EPILOGUE()