/*
 * Simple C functions to supplement the C library
 *
 * Copyright (c) 2006 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
#include "qemu/osdep.h"
#include "qemu/cutils.h"
#include "qemu/bswap.h"
#include "host/cpuinfo.h"
typedef bool (*biz_accel_fn)(const void *, size_t);
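/*
 * Functions entered into accel_table[] below are only ever invoked
 * with len >= 256; buffer_is_zero_ool() routes shorter buffers to
 * buffer_is_zero_int_lt256() instead.
 */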
static bool buffer_is_zero_int_lt256(const void *buf, size_t len)
{
    uint64_t t;
    const uint64_t *p, *e;

    /*
     * Use unaligned memory access functions to handle
     * the beginning and end of the buffer.
     */
    if (unlikely(len <= 8)) {
        return (ldl_he_p(buf) | ldl_he_p(buf + len - 4)) == 0;
    }

    t = ldq_he_p(buf) | ldq_he_p(buf + len - 8);
    p = QEMU_ALIGN_PTR_DOWN(buf + 8, 8);
    e = QEMU_ALIGN_PTR_DOWN(buf + len - 1, 8);

    /* Read 0 to 31 aligned words from the middle. */
    while (p < e) {
        t |= *p++;
    }

    return t == 0;
}
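/*
 * A worked example of the overlapping loads above, assuming len == 6:
 * ldl_he_p(buf) reads bytes [0,4) and ldl_he_p(buf + 2) reads bytes
 * [2,6), so together the two 4-byte loads cover every byte whenever
 * 4 <= len <= 8, without any per-byte loop.
 */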
static bool buffer_is_zero_int_ge256(const void *buf, size_t len)
{
    /*
     * Use unaligned memory access functions to handle
     * the beginning and end of the buffer.
     */
    uint64_t t = ldq_he_p(buf) | ldq_he_p(buf + len - 8);
    const uint64_t *p = QEMU_ALIGN_PTR_DOWN(buf + 8, 8);
    const uint64_t *e = QEMU_ALIGN_PTR_DOWN(buf + len - 1, 8);

    /* Collect a partial block at the tail end. */
    t |= e[-7] | e[-6] | e[-5] | e[-4] | e[-3] | e[-2] | e[-1];

    /*
     * Loop over 64 byte blocks.
     * With the head and tail removed, e - p >= 30,
     * so the loop must iterate at least 3 times.
     */
    do {
        if (t) {
            return false;
        }
        t = p[0] | p[1] | p[2] | p[3] | p[4] | p[5] | p[6] | p[7];
        p += 8;
    } while (p < e - 7);

    return t == 0;
}
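/*
 * Where "e - p >= 30" comes from: with len >= 256, p is at most
 * buf + 8 and e is at least buf + len - 8, so the aligned middle
 * spans at least len - 16 >= 240 bytes, i.e. 30 words.  The tail
 * read above consumes 7 of them, and the loop continues while
 * p < e - 7, so it executes at least ceil(23 / 8) = 3 times.
 */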
#if defined(CONFIG_AVX2_OPT) || defined(__SSE2__)
#include <immintrin.h>
/* Helper for preventing the compiler from reassociating
   chains of binary vector operations. */
#define SSE_REASSOC_BARRIER(vec0, vec1) asm("" : "+x"(vec0), "+x"(vec1))
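/*
 * The empty asm makes each vector an input/output ("+x") of an
 * opaque statement, so the optimizer cannot see through it and fold
 * the v and w accumulators into one serial OR chain; keeping two
 * independent chains lets the ORs execute in parallel.
 */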
/* Note that these vectorized functions may assume len >= 256. */
static bool __attribute__((target("sse2")))
buffer_zero_sse2(const void *buf, size_t len)
{
    /* Unaligned loads at head/tail. */
    __m128i v = *(__m128i_u *)(buf);
    __m128i w = *(__m128i_u *)(buf + len - 16);
    /* Align head/tail to 16-byte boundaries. */
    const __m128i *p = QEMU_ALIGN_PTR_DOWN(buf + 16, 16);
    const __m128i *e = QEMU_ALIGN_PTR_DOWN(buf + len - 1, 16);
    __m128i zero = { 0 };

    /* Collect a partial block at tail end. */
    v |= e[-1]; w |= e[-2];
    SSE_REASSOC_BARRIER(v, w);
    v |= e[-3]; w |= e[-4];
    SSE_REASSOC_BARRIER(v, w);
    v |= e[-5]; w |= e[-6];
    SSE_REASSOC_BARRIER(v, w);
    v |= w;

    /*
     * Loop over complete 128-byte blocks.
     * With the head and tail removed, e - p >= 14, so the loop
     * must iterate at least once.
     */
    do {
        v = _mm_cmpeq_epi8(v, zero);
        if (unlikely(_mm_movemask_epi8(v) != 0xFFFF)) {
            return false;
        }
        v = p[0]; w = p[1];
        SSE_REASSOC_BARRIER(v, w);
        v |= p[2]; w |= p[3];
        SSE_REASSOC_BARRIER(v, w);
        v |= p[4]; w |= p[5];
        SSE_REASSOC_BARRIER(v, w);
        v |= p[6]; w |= p[7];
        SSE_REASSOC_BARRIER(v, w);
        v |= w;
        p += 8;
    } while (p < e - 7);

    return _mm_movemask_epi8(_mm_cmpeq_epi8(v, zero)) == 0xFFFF;
}
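/*
 * The same bound as the scalar version, in 16-byte units: p is at
 * most buf + 16 and e at least buf + len - 16, so with len >= 256
 * the middle holds at least 224 bytes, i.e. e - p >= 14 vectors,
 * and the do/while above is safely entered at least once.
 */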
#ifdef CONFIG_AVX2_OPT
static bool __attribute__((target("avx2")))
buffer_zero_avx2(const void *buf, size_t len)
{
    /* Unaligned loads at head/tail. */
    __m256i v = *(__m256i_u *)(buf);
    __m256i w = *(__m256i_u *)(buf + len - 32);
    /* Align head/tail to 32-byte boundaries. */
    const __m256i *p = QEMU_ALIGN_PTR_DOWN(buf + 32, 32);
    const __m256i *e = QEMU_ALIGN_PTR_DOWN(buf + len - 1, 32);
    __m256i zero = { 0 };

    /* Collect a partial block at tail end. */
    v |= e[-1]; w |= e[-2];
    SSE_REASSOC_BARRIER(v, w);
    v |= e[-3]; w |= e[-4];
    SSE_REASSOC_BARRIER(v, w);
    v |= e[-5]; w |= e[-6];
    SSE_REASSOC_BARRIER(v, w);
    v |= w;

    /* Loop over complete 256-byte blocks. */
    for (; p < e - 7; p += 8) {
        /* PTEST is not profitable here. */
        v = _mm256_cmpeq_epi8(v, zero);
        if (unlikely(_mm256_movemask_epi8(v) != 0xFFFFFFFF)) {
            return false;
        }
        v = p[0]; w = p[1];
        SSE_REASSOC_BARRIER(v, w);
        v |= p[2]; w |= p[3];
        SSE_REASSOC_BARRIER(v, w);
        v |= p[4]; w |= p[5];
        SSE_REASSOC_BARRIER(v, w);
        v |= p[6]; w |= p[7];
        SSE_REASSOC_BARRIER(v, w);
        v |= w;
    }

    return _mm256_movemask_epi8(_mm256_cmpeq_epi8(v, zero)) == 0xFFFFFFFF;
}
#endif /* CONFIG_AVX2_OPT */
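/*
 * Unlike the SSE2 variant, this uses a for loop: in 32-byte units
 * the guaranteed middle is only e - p >= 6 vectors (len - 64 >= 192
 * bytes for len >= 256), which is less than the 8 vectors the loop
 * body consumes, so zero iterations must be possible.
 */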
static biz_accel_fn const accel_table[] = {
    buffer_is_zero_int_ge256,
    buffer_zero_sse2,
#ifdef CONFIG_AVX2_OPT
    buffer_zero_avx2,
#endif
};

static unsigned best_accel(void)
{
    unsigned info = cpuinfo_init();

#ifdef CONFIG_AVX2_OPT
    if (info & CPUINFO_AVX2) {
        return 2;
    }
#endif
    return info & CPUINFO_SSE2 ? 1 : 0;
}
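/*
 * The table is ordered slowest to fastest, so best_accel() simply
 * returns the highest index whose implementation the host CPU
 * supports: 2 for AVX2, 1 for SSE2, 0 for the portable integer
 * version.
 */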
#elif defined(__aarch64__) && defined(__ARM_NEON)
#include <arm_neon.h>

/*
 * Helper for preventing the compiler from reassociating
 * chains of binary vector operations.
 */
#define REASSOC_BARRIER(vec0, vec1) asm("" : "+w"(vec0), "+w"(vec1))
static bool buffer_is_zero_simd(const void *buf, size_t len)
{
    uint32x4_t t0, t1, t2, t3;

    /* Align head/tail to 16-byte boundaries. */
    const uint32x4_t *p = QEMU_ALIGN_PTR_DOWN(buf + 16, 16);
    const uint32x4_t *e = QEMU_ALIGN_PTR_DOWN(buf + len - 1, 16);

    /* Unaligned loads at head/tail. */
    t0 = vld1q_u32(buf) | vld1q_u32(buf + len - 16);

    /* Collect a partial block at tail end. */
    t1 = e[-7] | e[-6];
    t2 = e[-5] | e[-4];
    t3 = e[-3] | e[-2];
    t0 |= e[-1];
    REASSOC_BARRIER(t0, t1);
    REASSOC_BARRIER(t2, t3);
    t0 |= t1;
    t2 |= t3;
    REASSOC_BARRIER(t0, t2);
    t0 |= t2;

    /*
     * Loop over complete 128-byte blocks.
     * With the head and tail removed, e - p >= 14, so the loop
     * must iterate at least once.
     */
    do {
        /*
         * Reduce via UMAXV. Whatever the actual result,
         * it will only be zero if all input bytes are zero.
         */
        if (unlikely(vmaxvq_u32(t0) != 0)) {
            return false;
        }

        t0 = p[0] | p[1];
        t1 = p[2] | p[3];
        t2 = p[4] | p[5];
        t3 = p[6] | p[7];
        REASSOC_BARRIER(t0, t1);
        REASSOC_BARRIER(t2, t3);
        t0 |= t1;
        t2 |= t3;
        REASSOC_BARRIER(t0, t2);
        t0 |= t2;
        p += 8;
    } while (p < e - 7);

    return vmaxvq_u32(t0) == 0;
}
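/*
 * vmaxvq_u32 is a horizontal maximum across the four 32-bit lanes.
 * Since every lane is an unsigned OR of input words, the maximum is
 * zero exactly when every byte seen so far is zero, giving a
 * single-instruction reduction with no movemask equivalent needed.
 */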
#define best_accel() 1
static biz_accel_fn const accel_table[] = {
    buffer_is_zero_int_ge256,
    buffer_is_zero_simd,
};
#else
#define best_accel() 0
static biz_accel_fn const accel_table[1] = {
    buffer_is_zero_int_ge256
};
#endif
static biz_accel_fn buffer_is_zero_accel;
static unsigned accel_index;
bool buffer_is_zero_ool(const void *buf, size_t len)
{
    if (unlikely(len == 0)) {
        return true;
    }
    if (!buffer_is_zero_sample3(buf, len)) {
        return false;
    }
    /* All bytes are covered for any len <= 3. */
    if (unlikely(len <= 3)) {
        return true;
    }

    if (likely(len >= 256)) {
        return buffer_is_zero_accel(buf, len);
    }
    return buffer_is_zero_int_lt256(buf, len);
}
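/*
 * Callers normally reach this through the inline wrappers in
 * include/qemu/cutils.h, e.g. (a sketch):
 *
 *     if (buffer_is_zero(sector, 512)) {
 *         ... skip writing an all-zero sector ...
 *     }
 *
 * buffer_is_zero_sample3() is the cheap pre-filter from the same
 * header: it short-circuits on three widely spaced sample bytes,
 * rejecting most non-zero buffers before any wide loads run.
 */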
bool buffer_is_zero_ge256(const void *buf, size_t len)
{
    return buffer_is_zero_accel(buf, len);
}
bool test_buffer_is_zero_next_accel(void)
{
    if (accel_index != 0) {
        buffer_is_zero_accel = accel_table[--accel_index];
        return true;
    }
    return false;
}
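/*
 * Intended for the unit tests: each call steps down one entry in
 * accel_table until the portable implementation at index 0 is
 * reached, letting a test loop exercise every accelerator the host
 * supports.
 */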
static void __attribute__((constructor)) init_accel(void)
{
    accel_index = best_accel();
    buffer_is_zero_accel = accel_table[accel_index];
}
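/*
 * __attribute__((constructor)) makes init_accel() run when the
 * binary (or shared object) is loaded, before main(), so both
 * accel_index and buffer_is_zero_accel are valid before any caller
 * can reach buffer_is_zero_ool() or buffer_is_zero_ge256().
 */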