From dd976cae239a4cc4d0111b3aa4a228afc2c2c308 Mon Sep 17 00:00:00 2001 From: Evgeny Nizhibitsky Date: Sun, 31 Mar 2024 12:23:32 +0100 Subject: [PATCH] wc: simplify and generalize AVX code * src/wc_avx2.c (wc_lines_avx2): Change from _mm256_sub_epi8() + _mm256_sad_epu8() to _mm256_movemask_epi8() + __builtin_popcount(). This will allow adjusting the I/O size above 16KiB. * configure.ac: Align check with routines used in wc_avx2.c. --- configure.ac | 5 +++-- src/wc_avx2.c | 54 ++++++++++-------------------------------------------- 2 files changed, 13 insertions(+), 46 deletions(-) diff --git a/configure.ac b/configure.ac index 9cb6ee149..70c8a65c6 100644 --- a/configure.ac +++ b/configure.ac @@ -658,8 +658,9 @@ AC_LINK_IFELSE( int main (void) { - __m256i a, b; - a = _mm256_sad_epu8 (a, b); + __m256i matches = _mm256_setzero_si256 (); + int mask = _mm256_movemask_epi8 (matches); + int lines = __builtin_popcount (mask); return __builtin_cpu_supports ("avx2"); } ]]) diff --git a/src/wc_avx2.c b/src/wc_avx2.c index cc0454a46..5ec714759 100644 --- a/src/wc_avx2.c +++ b/src/wc_avx2.c @@ -22,10 +22,7 @@ #include -/* This must be below 16 KB (16384) or else the accumulators can - theoretically overflow, producing wrong result. This is 2*32 bytes below, - so there is no single bytes in the optimal case. */ -#define BUFSIZE (16320) +#define BUFSIZE 16384 /* Read FD and return a summary. */ extern struct wc_lines @@ -34,21 +31,11 @@ wc_lines_avx2 (int fd) intmax_t lines = 0; intmax_t bytes = 0; - __m256i - zeroes = _mm256_setzero_si256 (), - endlines = _mm256_set1_epi8 ('\n'); + __m256i endlines = _mm256_set1_epi8 ('\n'); while (true) { - /* Using two parallel accumulators gave a good performance increase. - Adding a third gave no additional benefit, at least on an - Intel Xeon E3-1231v3. Maybe on a newer CPU with additional vector - execution engines it would be a win. */ - __m256i - accumulator = _mm256_setzero_si256 (), - accumulator2 = _mm256_setzero_si256 (), - avx_buf[BUFSIZE / sizeof (__m256i)]; - + __m256i avx_buf[BUFSIZE / sizeof (__m256i)]; ssize_t bytes_read = read (fd, avx_buf, sizeof avx_buf); if (bytes_read <= 0) return (struct wc_lines) { bytes_read == 0 ? 0 : errno, lines, bytes }; @@ -56,37 +43,16 @@ wc_lines_avx2 (int fd) bytes += bytes_read; __m256i *datap = avx_buf; - while (bytes_read >= 64) + while (bytes_read >= 32) { - __m256i - to_match = _mm256_load_si256 (datap), - to_match2 = _mm256_load_si256 (datap + 1), - matches = _mm256_cmpeq_epi8 (to_match, endlines), - matches2 = _mm256_cmpeq_epi8 (to_match2, endlines); - - /* Compare will set each 8 bit integer in the register to 0xFF - on match. When we subtract it the 8 bit accumulators - will underflow, so this is equal to adding 1. */ - accumulator = _mm256_sub_epi8 (accumulator, matches); - accumulator2 = _mm256_sub_epi8 (accumulator2, matches2); - - datap += 2; - bytes_read -= 64; + __m256i to_match = _mm256_load_si256 (datap); + __m256i matches = _mm256_cmpeq_epi8 (to_match, endlines); + int mask = _mm256_movemask_epi8 (matches); + lines += __builtin_popcount (mask); + datap += 1; + bytes_read -= 32; } - /* Horizontally add all 8 bit integers in the register. */ - accumulator = _mm256_sad_epu8 (accumulator, zeroes); - lines += _mm256_extract_epi16 (accumulator, 0) - + _mm256_extract_epi16 (accumulator, 4) - + _mm256_extract_epi16 (accumulator, 8) - + _mm256_extract_epi16 (accumulator, 12); - - accumulator2 = _mm256_sad_epu8 (accumulator2, zeroes); - lines += _mm256_extract_epi16 (accumulator2, 0) - + _mm256_extract_epi16 (accumulator2, 4) - + _mm256_extract_epi16 (accumulator2, 8) - + _mm256_extract_epi16 (accumulator2, 12); - /* Finish up any left over bytes */ char *end = (char *) datap + bytes_read; for (char *p = (char *) datap; p < end; p++) -- 2.11.4.GIT