/*
 * Simple C functions to supplement the C library
 *
 * Copyright (c) 2006 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "qemu/osdep.h"
#include "qemu/cutils.h"
#include "qemu/bswap.h"
#include "host/cpuinfo.h"

typedef bool (*biz_accel_fn)(const void *, size_t);
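
/*
 * Per-architecture tables of biz_accel_fn implementations are defined
 * below; init_accel() selects the best entry the host CPU supports at
 * load time, and test_buffer_is_zero_next_accel() lets the unit tests
 * step through the remaining entries.
 */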

static bool buffer_is_zero_int_lt256(const void *buf, size_t len)
{
    uint64_t t;
    const uint64_t *p, *e;

    /*
     * Use unaligned memory access functions to handle
     * the beginning and end of the buffer.
     */
    if (unlikely(len <= 8)) {
        /* Both loads are in bounds: the caller ensures len >= 4. */
        return (ldl_he_p(buf) | ldl_he_p(buf + len - 4)) == 0;
    }

    t = ldq_he_p(buf) | ldq_he_p(buf + len - 8);
    p = QEMU_ALIGN_PTR_DOWN(buf + 8, 8);
    e = QEMU_ALIGN_PTR_DOWN(buf + len - 1, 8);

    /* Read 0 to 31 aligned words from the middle. */
    while (p < e) {
        t |= *p++;
    }
    return t == 0;
}
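
/*
 * Integer fallback for large buffers: the caller guarantees len >= 256,
 * which covers the unaligned head, a 56-byte tail block, and at least
 * three full 64-byte middle blocks without further length checks.
 */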
static bool buffer_is_zero_int_ge256(const void *buf, size_t len)
{
    /*
     * Use unaligned memory access functions to handle
     * the beginning and end of the buffer.
     */
    uint64_t t = ldq_he_p(buf) | ldq_he_p(buf + len - 8);
    const uint64_t *p = QEMU_ALIGN_PTR_DOWN(buf + 8, 8);
    const uint64_t *e = QEMU_ALIGN_PTR_DOWN(buf + len - 1, 8);

    /* Collect a partial block at the tail end. */
    t |= e[-7] | e[-6] | e[-5] | e[-4] | e[-3] | e[-2] | e[-1];

    /*
     * Loop over 64 byte blocks.
     * With the head and tail removed, e - p >= (256 - 8 - 8) / 8 = 30,
     * so the loop must iterate at least 3 times.
     */
    do {
        if (t) {
            return false;
        }
        t = p[0] | p[1] | p[2] | p[3] | p[4] | p[5] | p[6] | p[7];
        p += 8;
    } while (p < e - 7);

    return t == 0;
}

#if defined(CONFIG_AVX2_OPT) || defined(__SSE2__)
#include <immintrin.h>

/*
 * Helper for preventing the compiler from reassociating
 * chains of binary vector operations, so that the two accumulators
 * remain independent dependency chains.
 */
#define SSE_REASSOC_BARRIER(vec0, vec1) asm("" : "+x"(vec0), "+x"(vec1))

/* Note that these vectorized functions may assume len >= 256. */
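
/*
 * SSE2 version: accumulate through two 16-byte registers (v, w) and
 * test for zero by comparing every byte against zero and requiring
 * all sixteen lanes to match.
 */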
static bool __attribute__((target("sse2")))
buffer_zero_sse2(const void *buf, size_t len)
{
    /* Unaligned loads at head/tail. */
    __m128i v = *(__m128i_u *)(buf);
    __m128i w = *(__m128i_u *)(buf + len - 16);
    /* Align head/tail to 16-byte boundaries. */
    const __m128i *p = QEMU_ALIGN_PTR_DOWN(buf + 16, 16);
    const __m128i *e = QEMU_ALIGN_PTR_DOWN(buf + len - 1, 16);
    __m128i zero = { 0 };

    /* Collect a partial block at tail end. */
    v |= e[-1]; w |= e[-2];
    SSE_REASSOC_BARRIER(v, w);
    v |= e[-3]; w |= e[-4];
    SSE_REASSOC_BARRIER(v, w);
    v |= e[-5]; w |= e[-6];
    SSE_REASSOC_BARRIER(v, w);
    v |= e[-7]; v |= w;

    /*
     * Loop over complete 128-byte blocks.
     * With the head and tail removed, e - p >= (256 - 16 - 16) / 16 = 14,
     * so the loop must iterate at least once.
     */
    do {
        v = _mm_cmpeq_epi8(v, zero);
        if (unlikely(_mm_movemask_epi8(v) != 0xFFFF)) {
            return false;
        }
        v = p[0]; w = p[1];
        SSE_REASSOC_BARRIER(v, w);
        v |= p[2]; w |= p[3];
        SSE_REASSOC_BARRIER(v, w);
        v |= p[4]; w |= p[5];
        SSE_REASSOC_BARRIER(v, w);
        v |= p[6]; w |= p[7];
        SSE_REASSOC_BARRIER(v, w);
        v |= w;
        p += 8;
    } while (p < e - 7);

    return _mm_movemask_epi8(_mm_cmpeq_epi8(v, zero)) == 0xFFFF;
}

#ifdef CONFIG_AVX2_OPT
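/*
 * AVX2 version: the same structure as buffer_zero_sse2, widened to
 * 32-byte registers and 256 bytes per loop iteration.
 */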
static bool __attribute__((target("avx2")))
buffer_zero_avx2(const void *buf, size_t len)
{
    /* Unaligned loads at head/tail. */
    __m256i v = *(__m256i_u *)(buf);
    __m256i w = *(__m256i_u *)(buf + len - 32);
    /* Align head/tail to 32-byte boundaries. */
    const __m256i *p = QEMU_ALIGN_PTR_DOWN(buf + 32, 32);
    const __m256i *e = QEMU_ALIGN_PTR_DOWN(buf + len - 1, 32);
    __m256i zero = { 0 };

    /* Collect a partial block at tail end. */
    v |= e[-1]; w |= e[-2];
    SSE_REASSOC_BARRIER(v, w);
    v |= e[-3]; w |= e[-4];
    SSE_REASSOC_BARRIER(v, w);
    v |= e[-5]; w |= e[-6];
    SSE_REASSOC_BARRIER(v, w);
    v |= e[-7]; v |= w;

    /*
     * Loop over complete 256-byte blocks.
     * With the head and tail removed, e - p may be as small as 6,
     * so the loop may iterate zero times.
     */
    for (; p < e - 7; p += 8) {
        /* PTEST is not profitable here. */
        v = _mm256_cmpeq_epi8(v, zero);
        if (unlikely(_mm256_movemask_epi8(v) != 0xFFFFFFFF)) {
            return false;
        }
        v = p[0]; w = p[1];
        SSE_REASSOC_BARRIER(v, w);
        v |= p[2]; w |= p[3];
        SSE_REASSOC_BARRIER(v, w);
        v |= p[4]; w |= p[5];
        SSE_REASSOC_BARRIER(v, w);
        v |= p[6]; w |= p[7];
        SSE_REASSOC_BARRIER(v, w);
        v |= w;
    }

    return _mm256_movemask_epi8(_mm256_cmpeq_epi8(v, zero)) == 0xFFFFFFFF;
}
#endif /* CONFIG_AVX2_OPT */

static biz_accel_fn const accel_table[] = {
    buffer_is_zero_int_ge256,
    buffer_zero_sse2,
#ifdef CONFIG_AVX2_OPT
    buffer_zero_avx2,
#endif
};
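
/*
 * Return the accel_table index of the best implementation the host
 * CPU supports: AVX2 when available, then SSE2, then the integer
 * fallback.
 */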
static unsigned best_accel(void)
{
    unsigned info = cpuinfo_init();

#ifdef CONFIG_AVX2_OPT
    if (info & CPUINFO_AVX2) {
        return 2;
    }
#endif
    return info & CPUINFO_SSE2 ? 1 : 0;
}

#elif defined(__aarch64__) && defined(__ARM_NEON)
#include <arm_neon.h>

/*
 * Helper for preventing the compiler from reassociating
 * chains of binary vector operations.
 */
#define REASSOC_BARRIER(vec0, vec1) asm("" : "+w"(vec0), "+w"(vec1))
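
/*
 * NEON version: accumulate through four 16-byte registers, merging
 * them pairwise behind barriers so the dependency chains stay
 * independent, and reduce with UMAXV to test for zero.
 */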
static bool buffer_is_zero_simd(const void *buf, size_t len)
{
    uint32x4_t t0, t1, t2, t3;

    /* Align head/tail to 16-byte boundaries. */
    const uint32x4_t *p = QEMU_ALIGN_PTR_DOWN(buf + 16, 16);
    const uint32x4_t *e = QEMU_ALIGN_PTR_DOWN(buf + len - 1, 16);

    /* Unaligned loads at head/tail. */
    t0 = vld1q_u32(buf) | vld1q_u32(buf + len - 16);

    /* Collect a partial block at tail end. */
    t1 = e[-7] | e[-6];
    t2 = e[-5] | e[-4];
    t3 = e[-3] | e[-2];
    t0 |= e[-1];
    REASSOC_BARRIER(t0, t1);
    REASSOC_BARRIER(t2, t3);
    t0 |= t1;
    t2 |= t3;
    REASSOC_BARRIER(t0, t2);
    t0 |= t2;

    /*
     * Loop over complete 128-byte blocks.
     * With the head and tail removed, e - p >= 14, so the loop
     * must iterate at least once.
     */
    do {
        /*
         * Reduce via UMAXV. Whatever the actual result,
         * it will only be zero if all input bytes are zero.
         */
        if (unlikely(vmaxvq_u32(t0) != 0)) {
            return false;
        }

        t0 = p[0] | p[1];
        t1 = p[2] | p[3];
        t2 = p[4] | p[5];
        t3 = p[6] | p[7];
        REASSOC_BARRIER(t0, t1);
        REASSOC_BARRIER(t2, t3);
        t0 |= t1;
        t2 |= t3;
        REASSOC_BARRIER(t0, t2);
        t0 |= t2;
        p += 8;
    } while (p < e - 7);

    return vmaxvq_u32(t0) == 0;
}
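
/*
 * __ARM_NEON is guaranteed by the #elif above, so the SIMD
 * implementation is always usable and always preferred.
 */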
#define best_accel() 1

static biz_accel_fn const accel_table[] = {
    buffer_is_zero_int_ge256,
    buffer_is_zero_simd,
};

#else
#define best_accel() 0
static biz_accel_fn const accel_table[1] = {
    buffer_is_zero_int_ge256
};
#endif

static biz_accel_fn buffer_is_zero_accel;
static unsigned accel_index;
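
/*
 * Out-of-line entry point behind the buffer_is_zero() wrapper in
 * qemu/cutils.h.  buffer_is_zero_sample3() (from the same header)
 * tests three widely spaced bytes first, which cheaply rejects most
 * non-zero buffers before any wide loads are issued.
 */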
bool buffer_is_zero_ool(const void *buf, size_t len)
{
    if (unlikely(len == 0)) {
        return true;
    }
    if (!buffer_is_zero_sample3(buf, len)) {
        return false;
    }
    /* All bytes are covered for any len <= 3. */
    if (unlikely(len <= 3)) {
        return true;
    }

    if (likely(len >= 256)) {
        return buffer_is_zero_accel(buf, len);
    }
    return buffer_is_zero_int_lt256(buf, len);
}
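
/*
 * Variant for callers that already know len >= 256; presumably the
 * inline wrapper has performed the sample test, so dispatch straight
 * to the selected accelerator.
 */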
bool buffer_is_zero_ge256(const void *buf, size_t len)
{
    return buffer_is_zero_accel(buf, len);
}
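
/*
 * Step down to the next-slower implementation, if any; the unit tests
 * use this to exercise every entry in accel_table.
 */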
bool test_buffer_is_zero_next_accel(void)
{
    if (accel_index != 0) {
        buffer_is_zero_accel = accel_table[--accel_index];
        return true;
    }
    return false;
}
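
/* Select the best implementation for this CPU before main() runs. */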
static void __attribute__((constructor)) init_accel(void)
{
    accel_index = best_accel();
    buffer_is_zero_accel = accel_table[accel_index];
}
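
/*
 * Typical test-harness usage, as a sketch (the real harness lives in
 * tests/unit/test-bufferiszero.c): given some buffer buf of length
 * len with a known expected result:
 *
 *     do {
 *         g_assert(buffer_is_zero(buf, len) == expected);
 *     } while (test_buffer_is_zero_next_accel());
 */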