/*
 * Simple C functions to supplement the C library
 *
 * Copyright (c) 2006 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
#include "qemu/osdep.h"
#include "qemu/cutils.h"
#include "qemu/bswap.h"
#include "host/cpuinfo.h"
typedef bool (*biz_accel_fn)(const void *, size_t);
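/*
 * Functions entered into accel_table[] below are only ever invoked
 * with len >= 256; buffer_is_zero_ool() routes shorter buffers to
 * buffer_is_zero_int_lt256() instead.
 */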
static bool buffer_is_zero_int_lt256(const void *buf, size_t len)
{
    uint64_t t;
    const uint64_t *p, *e;

    /*
     * Use unaligned memory access functions to handle
     * the beginning and end of the buffer.
     */
    if (unlikely(len <= 8)) {
        return (ldl_he_p(buf) | ldl_he_p(buf + len - 4)) == 0;
    }

    t = ldq_he_p(buf) | ldq_he_p(buf + len - 8);
    p = QEMU_ALIGN_PTR_DOWN(buf + 8, 8);
    e = QEMU_ALIGN_PTR_DOWN(buf + len - 1, 8);

    /* Read 0 to 31 aligned words from the middle. */
    while (p < e) {
        t |= *p++;
    }

    return t == 0;
}
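/*
 * A worked example of the overlapping loads above, assuming len == 6:
 * ldl_he_p(buf) reads bytes [0,4) and ldl_he_p(buf + 2) reads bytes
 * [2,6), so together the two 4-byte loads cover every byte whenever
 * 4 <= len <= 8, without any per-byte loop.
 */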
static bool buffer_is_zero_int_ge256(const void *buf, size_t len)
{
    /*
     * Use unaligned memory access functions to handle
     * the beginning and end of the buffer.
     */
    uint64_t t = ldq_he_p(buf) | ldq_he_p(buf + len - 8);
    const uint64_t *p = QEMU_ALIGN_PTR_DOWN(buf + 8, 8);
    const uint64_t *e = QEMU_ALIGN_PTR_DOWN(buf + len - 1, 8);

    /* Collect a partial block at the tail end. */
    t |= e[-7] | e[-6] | e[-5] | e[-4] | e[-3] | e[-2] | e[-1];

    /*
     * Loop over 64 byte blocks.
     * With the head and tail removed, e - p >= 30,
     * so the loop must iterate at least 3 times.
     */
    do {
        if (t) {
            return false;
        }
        t = p[0] | p[1] | p[2] | p[3] | p[4] | p[5] | p[6] | p[7];
        p += 8;
    } while (p < e - 7);

    return t == 0;
}
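/*
 * Where "e - p >= 30" comes from: with len >= 256, p is at most
 * buf + 8 and e is at least buf + len - 8, so the aligned middle
 * spans at least len - 16 >= 240 bytes, i.e. 30 words.  The tail
 * read above consumes 7 of them, and the loop continues while
 * p < e - 7, so it executes at least ceil(23 / 8) = 3 times.
 */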
#if defined(CONFIG_AVX2_OPT) || defined(__SSE2__)
#include <immintrin.h>
/* Helper for preventing the compiler from reassociating
   chains of binary vector operations. */
#define SSE_REASSOC_BARRIER(vec0, vec1) asm("" : "+x"(vec0), "+x"(vec1))
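/*
 * The empty asm makes each vector an input/output ("+x") of an
 * opaque statement, so the optimizer cannot see through it and fold
 * the v and w accumulators into one serial OR chain; keeping two
 * independent chains lets the ORs execute in parallel.
 */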
/* Note that these vectorized functions may assume len >= 256. */
static bool __attribute__((target("sse2")))
buffer_zero_sse2(const void *buf, size_t len)
{
    /* Unaligned loads at head/tail. */
    __m128i v = *(__m128i_u *)(buf);
    __m128i w = *(__m128i_u *)(buf + len - 16);
    /* Align head/tail to 16-byte boundaries. */
    const __m128i *p = QEMU_ALIGN_PTR_DOWN(buf + 16, 16);
    const __m128i *e = QEMU_ALIGN_PTR_DOWN(buf + len - 1, 16);
    __m128i zero = { 0 };

    /* Collect a partial block at tail end. */
    v |= e[-1]; w |= e[-2];
    SSE_REASSOC_BARRIER(v, w);
    v |= e[-3]; w |= e[-4];
    SSE_REASSOC_BARRIER(v, w);
    v |= e[-5]; w |= e[-6];
    SSE_REASSOC_BARRIER(v, w);
    v |= w;

    /*
     * Loop over complete 128-byte blocks.
     * With the head and tail removed, e - p >= 14, so the loop
     * must iterate at least once.
     */
    do {
        v = _mm_cmpeq_epi8(v, zero);
        if (unlikely(_mm_movemask_epi8(v) != 0xFFFF)) {
            return false;
        }
        v = p[0]; w = p[1];
        SSE_REASSOC_BARRIER(v, w);
        v |= p[2]; w |= p[3];
        SSE_REASSOC_BARRIER(v, w);
        v |= p[4]; w |= p[5];
        SSE_REASSOC_BARRIER(v, w);
        v |= p[6]; w |= p[7];
        SSE_REASSOC_BARRIER(v, w);
        v |= w;
        p += 8;
    } while (p < e - 7);

    return _mm_movemask_epi8(_mm_cmpeq_epi8(v, zero)) == 0xFFFF;
}
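/*
 * The same bound as the scalar version, in 16-byte units: p is at
 * most buf + 16 and e at least buf + len - 16, so with len >= 256
 * the middle holds at least 224 bytes, i.e. e - p >= 14 vectors,
 * and the do/while above is safely entered at least once.
 */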
#ifdef CONFIG_AVX2_OPT
static bool __attribute__((target("avx2")))
buffer_zero_avx2(const void *buf, size_t len)
{
    /* Unaligned loads at head/tail. */
    __m256i v = *(__m256i_u *)(buf);
    __m256i w = *(__m256i_u *)(buf + len - 32);
    /* Align head/tail to 32-byte boundaries. */
    const __m256i *p = QEMU_ALIGN_PTR_DOWN(buf + 32, 32);
    const __m256i *e = QEMU_ALIGN_PTR_DOWN(buf + len - 1, 32);
    __m256i zero = { 0 };

    /* Collect a partial block at tail end. */
    v |= e[-1]; w |= e[-2];
    SSE_REASSOC_BARRIER(v, w);
    v |= e[-3]; w |= e[-4];
    SSE_REASSOC_BARRIER(v, w);
    v |= e[-5]; w |= e[-6];
    SSE_REASSOC_BARRIER(v, w);
    v |= w;

    /* Loop over complete 256-byte blocks. */
    for (; p < e - 7; p += 8) {
        /* PTEST is not profitable here. */
        v = _mm256_cmpeq_epi8(v, zero);
        if (unlikely(_mm256_movemask_epi8(v) != 0xFFFFFFFF)) {
            return false;
        }
        v = p[0]; w = p[1];
        SSE_REASSOC_BARRIER(v, w);
        v |= p[2]; w |= p[3];
        SSE_REASSOC_BARRIER(v, w);
        v |= p[4]; w |= p[5];
        SSE_REASSOC_BARRIER(v, w);
        v |= p[6]; w |= p[7];
        SSE_REASSOC_BARRIER(v, w);
        v |= w;
    }

    return _mm256_movemask_epi8(_mm256_cmpeq_epi8(v, zero)) == 0xFFFFFFFF;
}
#endif /* CONFIG_AVX2_OPT */
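/*
 * Unlike the SSE2 variant, this uses a for loop: in 32-byte units
 * the guaranteed middle is only e - p >= 6 vectors (len - 64 >= 192
 * bytes for len >= 256), which is less than the 8 vectors the loop
 * body consumes, so zero iterations must be possible.
 */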
static biz_accel_fn const accel_table[] = {
    buffer_is_zero_int_ge256,
    buffer_zero_sse2,
#ifdef CONFIG_AVX2_OPT
    buffer_zero_avx2,
#endif
};

static unsigned best_accel(void)
{
    unsigned info = cpuinfo_init();

#ifdef CONFIG_AVX2_OPT
    if (info & CPUINFO_AVX2) {
        return 2;
    }
#endif
    return info & CPUINFO_SSE2 ? 1 : 0;
}
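/*
 * The table is ordered slowest to fastest, so best_accel() simply
 * returns the highest index whose implementation the host CPU
 * supports: 2 for AVX2, 1 for SSE2, 0 for the portable integer
 * version.
 */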
#elif defined(__aarch64__) && defined(__ARM_NEON)
#include <arm_neon.h>

/*
 * Helper for preventing the compiler from reassociating
 * chains of binary vector operations.
 */
#define REASSOC_BARRIER(vec0, vec1) asm("" : "+w"(vec0), "+w"(vec1))
static bool buffer_is_zero_simd(const void *buf, size_t len)
{
    uint32x4_t t0, t1, t2, t3;

    /* Align head/tail to 16-byte boundaries. */
    const uint32x4_t *p = QEMU_ALIGN_PTR_DOWN(buf + 16, 16);
    const uint32x4_t *e = QEMU_ALIGN_PTR_DOWN(buf + len - 1, 16);

    /* Unaligned loads at head/tail. */
    t0 = vld1q_u32(buf) | vld1q_u32(buf + len - 16);

    /* Collect a partial block at tail end. */
    t1 = e[-7] | e[-6];
    t2 = e[-5] | e[-4];
    t3 = e[-3] | e[-2];
    t0 |= e[-1];
    REASSOC_BARRIER(t0, t1);
    REASSOC_BARRIER(t2, t3);
    t0 |= t1;
    t2 |= t3;
    REASSOC_BARRIER(t0, t2);
    t0 |= t2;

    /*
     * Loop over complete 128-byte blocks.
     * With the head and tail removed, e - p >= 14, so the loop
     * must iterate at least once.
     */
    do {
        /*
         * Reduce via UMAXV. Whatever the actual result,
         * it will only be zero if all input bytes are zero.
         */
        if (unlikely(vmaxvq_u32(t0) != 0)) {
            return false;
        }

        t0 = p[0] | p[1];
        t1 = p[2] | p[3];
        t2 = p[4] | p[5];
        t3 = p[6] | p[7];
        REASSOC_BARRIER(t0, t1);
        REASSOC_BARRIER(t2, t3);
        t0 |= t1;
        t2 |= t3;
        REASSOC_BARRIER(t0, t2);
        t0 |= t2;
        p += 8;
    } while (p < e - 7);

    return vmaxvq_u32(t0) == 0;
}
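/*
 * vmaxvq_u32 is a horizontal maximum across the four 32-bit lanes.
 * Since every lane is an unsigned OR of input words, the maximum is
 * zero exactly when every byte seen so far is zero, giving a
 * single-instruction reduction with no movemask equivalent needed.
 */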
#define best_accel() 1
static biz_accel_fn const accel_table[] = {
    buffer_is_zero_int_ge256,
    buffer_is_zero_simd,
};
#else
#define best_accel() 0
static biz_accel_fn const accel_table[1] = {
    buffer_is_zero_int_ge256
};
#endif
static biz_accel_fn buffer_is_zero_accel;
static unsigned accel_index;
bool buffer_is_zero_ool(const void *buf, size_t len)
{
    if (unlikely(len == 0)) {
        return true;
    }
    if (!buffer_is_zero_sample3(buf, len)) {
        return false;
    }
    /* All bytes are covered for any len <= 3. */
    if (unlikely(len <= 3)) {
        return true;
    }

    if (likely(len >= 256)) {
        return buffer_is_zero_accel(buf, len);
    }
    return buffer_is_zero_int_lt256(buf, len);
}
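/*
 * Callers normally reach this through the inline wrappers in
 * include/qemu/cutils.h, e.g. (a sketch):
 *
 *     if (buffer_is_zero(sector, 512)) {
 *         ... skip writing an all-zero sector ...
 *     }
 *
 * buffer_is_zero_sample3() is the cheap pre-filter from the same
 * header: it short-circuits on three widely spaced sample bytes,
 * rejecting most non-zero buffers before any wide loads run.
 */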
bool buffer_is_zero_ge256(const void *buf, size_t len)
{
    return buffer_is_zero_accel(buf, len);
}
bool test_buffer_is_zero_next_accel(void)
{
    if (accel_index != 0) {
        buffer_is_zero_accel = accel_table[--accel_index];
        return true;
    }
    return false;
}
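/*
 * Intended for the unit tests: each call steps down one entry in
 * accel_table until the portable implementation at index 0 is
 * reached, letting a test loop exercise every accelerator the host
 * supports.
 */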
static void __attribute__((constructor)) init_accel(void)
{
    accel_index = best_accel();
    buffer_is_zero_accel = accel_table[accel_index];
}
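/*
 * __attribute__((constructor)) makes init_accel() run when the
 * binary (or shared object) is loaded, before main(), so both
 * accel_index and buffer_is_zero_accel are valid before any caller
 * can reach buffer_is_zero_ool() or buffer_is_zero_ge256().
 */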