beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / arm64 / popcount.asm
blob8c95c1fcc5e9cb9af0a7aefc85618c0c4d4a5440
1 dnl ARM64 Neon mpn_popcount -- mpn bit population count.
3 dnl Copyright 2013, 2014 Free Software Foundation, Inc.
5 dnl This file is part of the GNU MP Library.
6 dnl
7 dnl The GNU MP Library is free software; you can redistribute it and/or modify
8 dnl it under the terms of either:
9 dnl
10 dnl * the GNU Lesser General Public License as published by the Free
11 dnl Software Foundation; either version 3 of the License, or (at your
12 dnl option) any later version.
13 dnl
14 dnl or
15 dnl
16 dnl * the GNU General Public License as published by the Free Software
17 dnl Foundation; either version 2 of the License, or (at your option) any
18 dnl later version.
19 dnl
20 dnl or both in parallel, as here.
21 dnl
22 dnl The GNU MP Library is distributed in the hope that it will be useful, but
23 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
25 dnl for more details.
26 dnl
27 dnl You should have received copies of the GNU General Public License and the
28 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
29 dnl see https://www.gnu.org/licenses/.
31 include(`../config.m4')
33 C cycles/limb
34 C Cortex-A53 ?
35 C Cortex-A57 ?
37 C TODO
38 C * Consider greater unrolling.
39 C * Arrange to align the pointer, if that helps performance. Use the same
40 C read-and-mask trick we use on PCs, for simplicity and performance. (Sorry
41 C valgrind!)
42 C * Explore if explicit align directives, e.g., "[ptr:128]" help.
43 C * See rth's gmp-devel 2013-02/03 messages about final summation tricks.
C By default m4 treats everything from a # character to the end of the line
C as a comment, which would keep macros such as maxsize and chunksize from
C being expanded when they are written as immediates (#maxsize).  Move the
C m4 comment delimiter to a sequence that never occurs in this file.
45 changecom(@&*$)
C mp_bitcnt_t mpn_popcount (mp_srcptr ap, mp_size_t n)
C
C Count the set bits in the n limbs (64 bits each) starting at ap.
C
C In:       x0 = ap, x1 = n
C Out:      x0 = number of set bits
C Clobbers: x1, x4, x5, x7-x11, v0-v7, condition flags.  x30 is overwritten by
C           the internal bl instructions on the huge operand path and restored
C           from x8 before returning.  No stack is used.

C INPUT PARAMETERS
define(`ap',	x0)
define(`n',	x1)

C We sum into 16 16-bit counters in v4,v5, and at the end add v5 into v4,
C leaving 8 16-bit counters.  Every 128-bit vector (2 limbs) adds at most 16
C to each counter, and an odd leading limb adds up to 16 more to the 4 low
C counters only.  A counter can therefore reach 16*ceil(n/2), which has to
C stay below 2^16, i.e., n <= 0x1ffe.  (With n = 0x1fff and all bits set the
C low counters would reach exactly 2^16 and wrap around to 0.)  We use a
C chunksize close to that, but which allows the huge count code to jump deep
C into the code (at L(chu)); for that it has to be a multiple of 8.

define(`maxsize',  0x1ffe)
define(`chunksize',0x1ff0)

ASM_START()
PROLOGUE(mpn_popcount)

	mov	x11, #maxsize		C kept live; reused by the huge count loop
	cmp	n, x11
	b.hi	L(gt8k)

C Operands of at most maxsize limbs.  Also called (with bl) from the huge
C count code below for the final partial chunk.
L(lt8k):
	movi	v4.16b, #0		C clear summation register
	movi	v5.16b, #0		C clear summation register

C Peel off n mod 8 limbs according to the three low bits of n.
	tbz	n, #0, L(xx0)
	sub	n, n, #1
	ld1	{v0.1d}, [ap], #8	C load 1 limb (high half of v0 is zeroed)
	cnt	v6.16b, v0.16b
	uadalp	v4.8h,  v6.16b		C could also splat

L(xx0):	tbz	n, #1, L(x00)
	sub	n, n, #2
	ld1	{v0.2d}, [ap], #16	C load 2 limbs
	cnt	v6.16b, v0.16b
	uadalp	v4.8h,  v6.16b

L(x00):	tbz	n, #2, L(000)
	subs	n, n, #4
	ld1	{v0.2d,v1.2d}, [ap], #32	C load 4 limbs
	b.ls	L(sum)			C those were the last limbs

L(gt4):	ld1	{v2.2d,v3.2d}, [ap], #32	C load 4 limbs
	sub	n, n, #4
	cnt	v6.16b, v0.16b
	cnt	v7.16b, v1.16b
	b	L(mid)			C enter the loop halfway

L(000):	subs	n, n, #8
	b.lo	L(e0)			C nothing left for the main loop

C Main loop entry.  The huge count code enters here with n already reduced
C by 8 and with v4,v5 cleared.
L(chu):	ld1	{v2.2d,v3.2d}, [ap], #32	C load 4 limbs
	ld1	{v0.2d,v1.2d}, [ap], #32	C load 4 limbs
	cnt	v6.16b, v2.16b
	cnt	v7.16b, v3.16b
	subs	n, n, #8
	b.lo	L(end)

C Software pipelined loop, 8 limbs per iteration: loads, byte counts and
C accumulation of different limb groups are interleaved.
L(top):	ld1	{v2.2d,v3.2d}, [ap], #32	C load 4 limbs
	uadalp	v4.8h,  v6.16b
	cnt	v6.16b, v0.16b
	uadalp	v5.8h,  v7.16b
	cnt	v7.16b, v1.16b
L(mid):	ld1	{v0.2d,v1.2d}, [ap], #32	C load 4 limbs
	subs	n, n, #8
	uadalp	v4.8h,  v6.16b
	cnt	v6.16b, v2.16b
	uadalp	v5.8h,  v7.16b
	cnt	v7.16b, v3.16b
	b.hs	L(top)

C Drain the pipeline: v6,v7 hold byte counts not yet accumulated, and v0,v1
C hold limbs not yet counted.
L(end):	uadalp	v4.8h,  v6.16b
	uadalp	v5.8h,  v7.16b
L(sum):	cnt	v6.16b, v0.16b
	cnt	v7.16b, v1.16b
	uadalp	v4.8h,  v6.16b
	uadalp	v5.8h,  v7.16b
	add	v4.8h, v4.8h, v5.8h
					C we have 8 16-bit counts
L(e0):	uaddlp	v4.4s,  v4.8h		C we have 4 32-bit counts
	uaddlp	v4.2d,  v4.4s		C we have 2 64-bit counts
	mov	x0, v4.d[0]
	mov	x1, v4.d[1]
	add	x0, x0, x1
	ret				C to our caller, or to a bl below

C Code for count > maxsize.  Splits operand and calls above code.
define(`ap2',	x5)			C caller-saves reg not used above
L(gt8k):
	mov	x8, x30			C save return address, bl overwrites x30
	mov	x7, n			C full count (caller-saves reg not used above)
	mov	x4, #0			C total sum  (caller-saves reg not used above)
	mov	x9, #chunksize*8	C caller-saves reg not used above
	mov	x10, #chunksize		C caller-saves reg not used above

1:	add	ap2, ap, x9		C point at subsequent block
	mov	n, #chunksize-8		C count for this invocation, adjusted for entry pt
	movi	v4.16b, #0		C clear chunk summation register
	movi	v5.16b, #0		C clear chunk summation register
	bl	L(chu)			C jump deep inside code
	add	x4, x4, x0
	mov	ap, ap2			C put chunk pointer in place for calls
	sub	x7, x7, x10
	cmp	x7, x11			C more than maxsize limbs left?
	b.hi	1b

	mov	n, x7			C count for final invocation
	bl	L(lt8k)
	add	x0, x4, x0
	mov	x30, x8			C restore our own return address
	ret
EPILOGUE()