dnl  (gitweb scrape residue, neutralized as comments:)
dnl  beta-0.89.2
dnl  [luatex.git] / source / libs / gmp / gmp-src / mpn / arm64 / hamdist.asm
dnl  blob d298b951cf6cf73b67c4d22a5aef3a4db1393400
dnl ARM64 Neon mpn_hamdist -- mpn bit hamming distance.

dnl Copyright 2013, 2014 Free Software Foundation, Inc.

dnl This file is part of the GNU MP Library.
dnl
dnl The GNU MP Library is free software; you can redistribute it and/or modify
dnl it under the terms of either:
dnl
dnl   * the GNU Lesser General Public License as published by the Free
dnl     Software Foundation; either version 3 of the License, or (at your
dnl     option) any later version.
dnl
dnl or
dnl
dnl   * the GNU General Public License as published by the Free Software
dnl     Foundation; either version 2 of the License, or (at your option) any
dnl     later version.
dnl
dnl or both in parallel, as here.
dnl
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl for more details.
dnl
dnl You should have received copies of the GNU General Public License and the
dnl GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl see https://www.gnu.org/licenses/.
include(`../config.m4')

C	     cycles/limb
C Cortex-A53	 ?
C Cortex-A57	 ?

C TODO
C  * Consider greater unrolling.
C  * Arrange to align the pointer, if that helps performance.  Use the same
C    read-and-mask trick we use on PCs, for simplicity and performance.  (Sorry
C    valgrind!)
C  * Explore if explicit align directives, e.g., "[ptr:128]" help.
C  * See rth's gmp-devel 2013-02/03 messages about final summation tricks.

C Disable m4's default comment handling so literal characters in the code
C (e.g. "#immediate") are not swallowed as comments.
changecom(@&*$)

C INPUT PARAMETERS
define(`ap', x0)
define(`bp', x1)
define(`n',  x2)

C We sum into 16 16-bit counters in v4,v5, but at the end we sum them and end
C up with 8 16-bit counters.  Therefore, we can sum to 8(2^16-1) bits, or
C (8*2^16-1)/64 = 0x1fff limbs.  We use a chunksize close to that, but which
C allows the huge count code to jump deep into the code (at L(chu)).
define(`maxsize',  0x1fff)
define(`chunksize',0x1ff0)
ASM_START()

C mp_bitcnt_t mpn_hamdist (mp_srcptr ap, mp_srcptr bp, mp_size_t n)
C
C Population count of {ap,n} XOR {bp,n}, i.e. the bit hamming distance
C of the two n-limb operands.  Result is returned in x0.
C Uses only caller-saved registers (x0-x11, v0-v7, v16-v19), so no
C stack frame is needed; x30 is saved in x8 across the internal bl calls.

PROLOGUE(mpn_hamdist)
	mov	x11, #maxsize
	cmp	n, x11
	b.hi	L(gt8k)			C huge count: split into chunks

L(lt8k):
	movi	v4.16b, #0		C clear summation register
	movi	v5.16b, #0		C clear summation register

C Peel n mod 8 limbs (1, then 2, then 4) so the main loop handles 8 at a time.
	tbz	n, #0, L(xx0)
	sub	n, n, #1
	ld1	{v0.1d}, [ap], #8	C load 1 limb
	ld1	{v16.1d}, [bp], #8	C load 1 limb
	eor	v0.16b, v0.16b, v16.16b
	cnt	v6.16b, v0.16b
	uadalp	v4.8h, v6.16b		C could also splat

L(xx0):	tbz	n, #1, L(x00)
	sub	n, n, #2
	ld1	{v0.2d}, [ap], #16	C load 2 limbs
	ld1	{v16.2d}, [bp], #16	C load 2 limbs
	eor	v0.16b, v0.16b, v16.16b
	cnt	v6.16b, v0.16b
	uadalp	v4.8h, v6.16b

L(x00):	tbz	n, #2, L(000)
	subs	n, n, #4
	ld1	{v0.2d,v1.2d}, [ap], #32	C load 4 limbs
	ld1	{v16.2d,v17.2d}, [bp], #32	C load 4 limbs
	b.ls	L(sum)			C exactly 4 limbs left: just sum them

L(gt4):	ld1	{v2.2d,v3.2d}, [ap], #32	C load 4 limbs
	ld1	{v18.2d,v19.2d}, [bp], #32	C load 4 limbs
	eor	v0.16b, v0.16b, v16.16b
	eor	v1.16b, v1.16b, v17.16b
	sub	n, n, #4
	cnt	v6.16b, v0.16b
	cnt	v7.16b, v1.16b
	b	L(mid)			C enter software-pipelined loop mid-way

L(000):	subs	n, n, #8
	b.lo	L(e0)			C n was 0: result is 0

C L(chu) is also the deep entry point used by the L(gt8k) chunking code.
L(chu):	ld1	{v2.2d,v3.2d}, [ap], #32	C load 4 limbs
	ld1	{v0.2d,v1.2d}, [ap], #32	C load 4 limbs
	ld1	{v18.2d,v19.2d}, [bp], #32	C load 4 limbs
	ld1	{v16.2d,v17.2d}, [bp], #32	C load 4 limbs
	eor	v2.16b, v2.16b, v18.16b
	eor	v3.16b, v3.16b, v19.16b
	cnt	v6.16b, v2.16b
	cnt	v7.16b, v3.16b
	subs	n, n, #8
	b.lo	L(end)

C Main loop: 8 limbs/iteration, counts accumulated into 16-bit lanes of v4,v5.
L(top):	ld1	{v2.2d,v3.2d}, [ap], #32	C load 4 limbs
	ld1	{v18.2d,v19.2d}, [bp], #32	C load 4 limbs
	eor	v0.16b, v0.16b, v16.16b
	eor	v1.16b, v1.16b, v17.16b
	uadalp	v4.8h, v6.16b
	cnt	v6.16b, v0.16b
	uadalp	v5.8h, v7.16b
	cnt	v7.16b, v1.16b
L(mid):	ld1	{v0.2d,v1.2d}, [ap], #32	C load 4 limbs
	ld1	{v16.2d,v17.2d}, [bp], #32	C load 4 limbs
	eor	v2.16b, v2.16b, v18.16b
	eor	v3.16b, v3.16b, v19.16b
	subs	n, n, #8
	uadalp	v4.8h, v6.16b
	cnt	v6.16b, v2.16b
	uadalp	v5.8h, v7.16b
	cnt	v7.16b, v3.16b
	b.hs	L(top)

L(end):	uadalp	v4.8h, v6.16b
	uadalp	v5.8h, v7.16b
L(sum):	eor	v0.16b, v0.16b, v16.16b	C count the 4 limbs still in flight
	eor	v1.16b, v1.16b, v17.16b
	cnt	v6.16b, v0.16b
	cnt	v7.16b, v1.16b
	uadalp	v4.8h, v6.16b
	uadalp	v5.8h, v7.16b
	add	v4.8h, v4.8h, v5.8h
					C we have 8 16-bit counts
L(e0):	uaddlp	v4.4s, v4.8h		C we have 4 32-bit counts
	uaddlp	v4.2d, v4.4s		C we have 2 64-bit counts
	mov	x0, v4.d[0]
	mov	x1, v4.d[1]
	add	x0, x0, x1
	ret				C (missing in scraped copy; without it
					C  we would fall through into L(gt8k))

C Code for count > maxsize.  Splits operand and calls above code.
define(`ap2', x5)			C caller-saves reg not used above
define(`bp2', x6)			C caller-saves reg not used above
L(gt8k):
	mov	x8, x30			C save return address; bl clobbers x30
	mov	x7, n			C full count (caller-saves reg not used above)
	mov	x4, #0			C total sum  (caller-saves reg not used above)
	mov	x9, #chunksize*8	C caller-saves reg not used above
	mov	x10, #chunksize		C caller-saves reg not used above

1:	add	ap2, ap, x9		C point at subsequent block
	add	bp2, bp, x9		C point at subsequent block
	mov	n, #chunksize-8		C count for this invocation, adjusted for entry pt
	movi	v4.16b, #0		C clear chunk summation register
	movi	v5.16b, #0		C clear chunk summation register
	bl	L(chu)			C jump deep inside code
	add	x4, x4, x0		C accumulate chunk count into total
	mov	ap, ap2			C put chunk pointer in place for calls
	mov	bp, bp2			C put chunk pointer in place for calls
	sub	x7, x7, x10
	cmp	x7, x11
	b.hi	1b

	mov	n, x7			C count for final invocation
	bl	L(lt8k)
	add	x0, x4, x0		C total = chunk sums + final sum
	mov	x30, x8			C restore return address
	ret				C (missing in scraped copy)
EPILOGUE()