From f6ae00c9ac14a4be59aed51b31f09c8604ebc7a7 Mon Sep 17 00:00:00 2001 From: Kristian Nielsen Date: Wed, 14 Jan 2009 23:10:30 +0100 Subject: [PATCH] - New pack5() and unpack5() with reduced dependency chain, which is really good (12.9 sec for 2G words at compression to 26.4%). - Experiment with non-temporal stores for pack_overwriteptr. - Some more testing. --- .gitignore | 2 +- exp/packed.h | 54 +++++++++++++++++- exp/quick_pack_test.c | 38 +++++++++++++ exp/scan.cc | 152 ++++++++++++++++++++++++++++++++++++++++++++++---- include/bitop.h | 13 +++++ 5 files changed, 247 insertions(+), 12 deletions(-) create mode 100644 exp/quick_pack_test.c diff --git a/.gitignore b/.gitignore index 0e6ffc3..96b8d78 100644 --- a/.gitignore +++ b/.gitignore @@ -6,6 +6,6 @@ /exp/packed_test /exp/scan /exp/raw_mem_bw - +/exp/quick_pack_test /exp/back /foreign diff --git a/exp/packed.h b/exp/packed.h index 077b66e..99d9bf4 100644 --- a/exp/packed.h +++ b/exp/packed.h @@ -35,6 +35,13 @@ */ // #define PREFETCH_EXPERIMENT +/* + An experiment with using non-temporal writes in pack_overwriteptr. + + This avoid loading the memory being written into cache. +*/ +#define NONTEMPORAL_EXPERIMENT + class pack_ptr_base { private: /* Base pointer, never modified. */ @@ -209,6 +216,7 @@ class templ_pack_writeptr : public B { void pack2(uint64_t v); void pack3(uint64_t v); void pack4(uint64_t v); + void pack5(uint64_t v); }; /* @@ -225,6 +233,7 @@ class templ_pack_ptr : public templ_pack_writeptr { uint64_t unpack2(); uint64_t unpack3(); uint64_t unpack4(); + uint64_t unpack5(); }; typedef templ_pack_ptr pack_ptr; @@ -353,6 +362,38 @@ templ_pack_ptr::unpack4() #undef LOOKUP_UNPACK_GET_OR_VALUE #undef LOOKUP_UNPACK_GET_LSHIFT +/* + Try hard to reduce the length of the dependency chain: + - Don't use mul/div, as they have high latency. + - Very simple computation of first part of value (3 ops) to avoid + initial stall. + - Compute second part independently from first for parallelism. + + Method: 4 bits of header, 4, 8, 12, ..., 64 bits of data, total 8-68 bits. + + This is significantly faster (maybe 33% or so) than pack[1-4]. +*/ +template +inline void +templ_pack_writeptr::pack5(uint64_t v) +{ + uint64_t highest_bit; + asm( "bsrq %1, %0" : "=r" (highest_bit) : "r" (v|1) : "cc"); + uint64_t bits1= highest_bit>>2; + uint64_t bits2= (highest_bit & 0x3c) + 4; + B::putbits(4, bits1); + B::putbits(bits2, v); +} + +template +inline uint64_t +templ_pack_ptr::unpack5() +{ + uint64_t n= B::getbits(4); + return B::getbits((n*4) + 4); +} + + /* Class for stream writing bit-aligned values of arbitrary bit-size <= 64. @@ -397,7 +438,12 @@ pack_overwriteptr_base::putbits(uint64_t numbits, uint64_t data) bitpos= old_bitpos + numbits; if (bitpos >= 64) { +#ifdef NONTEMPORAL_EXPERIMENT + STORE_NON_TEMPORAL_64(base + i, v); + i++; +#else base[i++]= v; +#endif bitpos-= 64; if (likely(!(numbits == 64 && old_bitpos == 0))) v= data >> (64 - old_bitpos); @@ -409,5 +455,11 @@ pack_overwriteptr_base::putbits(uint64_t numbits, uint64_t data) inline void pack_overwriteptr_base::flush() { - base[i]= v; + if (bitpos > 0) { +#ifdef NONTEMPORAL_EXPERIMENT + STORE_NON_TEMPORAL_64(base + i, v); +#else + base[i]= v; +#endif + } } diff --git a/exp/quick_pack_test.c b/exp/quick_pack_test.c new file mode 100644 index 0000000..40dc337 --- /dev/null +++ b/exp/quick_pack_test.c @@ -0,0 +1,38 @@ +#include +#include + +/* + Test using asm to get best instructions for count-significant-bits and + divide-by-multiply. Turns out that especially multiply has a high latency + (3-8 cycles depending apparently), so not optimal to use for + pack()/unpack(). +*/ + +#define NOINLINE __attribute__((noinline)) + +NOINLINE static uint64_t +get_num_bits(uint64_t v) { + uint64_t v2= v|1; + uint64_t v3; + asm( "bsrq %1, %0" : "=r" (v3) : "r" (v2) : "cc"); + uint64_t v4= v3 + 8; + register uint64_t v5 asm("rax") = v4; + register uint64_t v6 asm ("rdx") = 0x1c71c71d00000000; + asm( "mulq %0" : "=r" (v6), "=r" (v5) : "r" (v5), "r" (v6)); + return v6; +} + +int +main(int argc, char *argv[]) { + uint64_t i; + for (i = 0; i <= 64; i++) { + uint64_t v; + if (i == 0) { + v = 0; + } else { + v = (uint64_t)1 << (i-1); + } + printf("%lu\t%16lx\t%lu\n", i, v, get_num_bits(v)); + } + return 0; +} diff --git a/exp/scan.cc b/exp/scan.cc index ca76b0e..affdf5d 100644 --- a/exp/scan.cc +++ b/exp/scan.cc @@ -21,6 +21,7 @@ #include #include #include +#include #include "packed_mask.h" #include "packed_nocond.h" @@ -132,6 +133,20 @@ gettime(void) } NOINLINE static uint64_t +run_sum(uint count, uint loops, uint64_t *base) +{ + uint64_t sum= 0; + for (uint j= loops; j > 0; j--) + { + for (uint i= 0; i < count; i++) + { + sum+= base[i]; + } + } + return sum; +} + +NOINLINE static uint64_t run_sum_del(uint count, uint loops, uint64_t *base) { uint64_t sum= 0; @@ -236,6 +251,21 @@ run_sum_packed(uint count, uint loops, uint64_t *base) return sum; } +NOINLINE static uint64_t +run_sum_packed5(uint count, uint loops, uint64_t *base) +{ + uint64_t sum= 0; + for (uint j= loops; j > 0; j--) + { + pack_ptr p(base); + for (uint i= count; i > 0; i--) + { + sum+= p.unpack5(); + } + } + return sum; +} + NOINLINE static void run_copy(uint count, uint loops, uint64_t *dst, uint64_t *src) { @@ -252,6 +282,62 @@ run_copy(uint count, uint loops, uint64_t *dst, uint64_t *src) } } +#include +void _mm_stream_pi(__m64 *p, __m64 a); + +NOINLINE static void +run_copy_non_temporal(uint count, uint loops, uint64_t *dst, uint64_t *src) +{ + if ((count & 7) != 0) { + fprintf(stderr, "run_copy_non_temporal(): Count must be divisible by 8 " + "for this test, but %u isn't.\n", count); + return; + } + count /= 8; + + for (uint j= loops; j > 0; j--) + { + __m64 *mm_dst= (__m64 *)dst; + for (uint i= 0; i < count; i++) + { + uint src_i= i & 511; + _mm_stream_pi(mm_dst, (__m64)(src[src_i])); + _mm_stream_pi(mm_dst+1, (__m64)(src[src_i+1])); + _mm_stream_pi(mm_dst+2, (__m64)(src[src_i+2])); + _mm_stream_pi(mm_dst+3, (__m64)(src[src_i+3])); + _mm_stream_pi(mm_dst+4, (__m64)(src[src_i+4])); + _mm_stream_pi(mm_dst+5, (__m64)(src[src_i+5])); + _mm_stream_pi(mm_dst+6, (__m64)(src[src_i+6])); + _mm_stream_pi(mm_dst+7, (__m64)(src[src_i+7])); + mm_dst += 8; + } + } +} + +NOINLINE static void +run_copy_non_temporal_no_unroll(uint count, uint loops, uint64_t *dst, uint64_t *src) +{ + if ((count & 7) != 0) { + fprintf(stderr, "run_copy_non_temporal(): Count must be divisible by 8 " + "for this test, but %u isn't.\n", count); + return; + } + count /= 8; + + for (uint j= loops; j > 0; j--) + { + __m64 *mm_dst= (__m64 *)dst; + for (uint i= 0; i < count; i++) + { + uint src_i= i & 511; + for (uint j= 0; j < 8; j++) { + _mm_stream_pi(mm_dst+j, (__m64)(src[src_i+j])); + } + mm_dst += 8; + } + } +} + NOINLINE static void run_pack1(uint count, uint loops, uint64_t *dst, uint64_t *src) { @@ -308,6 +394,20 @@ run_pack4(uint count, uint loops, uint64_t *dst, uint64_t *src) } } +NOINLINE static void +run_pack5(uint count, uint loops, uint64_t *dst, uint64_t *src) +{ + for (uint j= loops; j > 0; j--) + { + pack_overwriteptr p(dst); + for (uint i= 0; i < count; i++) + { + p.pack5(src[i&511]); + } + p.flush(); + } +} + int main(int argc, char *argv[]) { @@ -332,16 +432,46 @@ main(int argc, char *argv[]) fill_expanded(b1, count); printf("Expanded: %ld bytes\n", (long)(count*sizeof(*b1))); + /* Fill in some prior data to check correctness on non-zero destination. */ + memcpy(b2, b1, count*sizeof(*b1)); pack_overwriteptr p(b2); fill_packed(p, count); printf("Packed: %ld bytes\n", (long)(p.used()*sizeof(*b2))); + /* Check correctness of pack5() and unpack5(). */ + memcpy(b3, b1, count*sizeof(*b1)); + pack_overwriteptr p5a(b3); + for (uint i= 0; i < count; i++) { + p5a.pack5(b1[i]); + } + p5a.flush(); + printf("Packed: %ld bytes\n", (long)(p5a.used()*sizeof(*b2))); + pack_ptr p5b(b3); + for (uint i= 0; i < count; i++) { + uint64_t v= p5b.unpack5(); + if (v != b1[i]) { + printf("DIFF! i=%u exp=%lu (0x%lx) pack5=%lu (0x%lx)\n", + i, b1[i], b1[i], v, v); + exit(1); + } + } + + uint64_t sum; + /* Do in a loop to time packing. */ double start1= gettime(); run_copy(count, loops, b3, b1); double elapsed1= gettime() - start1; printf("copy: time=%g\n", elapsed1); start1= gettime(); + run_copy_non_temporal(count, loops, b3, b1); + elapsed1= gettime() - start1; + printf("copy_non_temporal: time=%g\n", elapsed1); + start1= gettime(); + run_copy_non_temporal_no_unroll(count, loops, b3, b1); + elapsed1= gettime() - start1; + printf("copy_non_temporal_no_unroll: time=%g\n", elapsed1); + start1= gettime(); run_pack1(count, loops, b3, b1); elapsed1= gettime() - start1; printf("pack1: time=%g\n", elapsed1); @@ -357,6 +487,14 @@ main(int argc, char *argv[]) run_pack4(count, loops, b3, b1); elapsed1= gettime() - start1; printf("pack4: time=%g\n", elapsed1); + start1= gettime(); + run_pack5(count, loops, b3, b1); + elapsed1= gettime() - start1; + printf("pack5: time=%g\n", elapsed1); + start1= gettime(); + sum= run_sum_packed5(count, loops, b3); + elapsed1= gettime() - start1; + printf("exp: unpack5() sum=%lu time=%g\n", (unsigned long)sum, elapsed1); pack_ptr_del p0(b3); fill_packed_del(p0, count); @@ -424,20 +562,14 @@ main(int argc, char *argv[]) } /* Time looping over data. */ - uint64_t sum= 0; + sum= 0; double start= gettime(); double elapsed; -#if 0 - for (uint j= 0; j < loops; j++) - { - for (uint i= 0; i < count; i++) - { - sum+= b1[i]; - } - } + sum= run_sum(count, loops, b1); elapsed= gettime() - start; printf("exp: sum=%lu time=%g\n", (unsigned long)sum, elapsed); +#if 0 uint32_t sum32= 0; start= gettime(); for (uint j= 0; j < loops; j++) @@ -505,12 +637,12 @@ main(int argc, char *argv[]) elapsed= gettime() - start; printf("getbits_unsigned(33): sum=%lu time=%g\n", (unsigned long)sum, elapsed); -#if 0 start= gettime(); sum= run_sum_packed(count, loops, b2); elapsed= gettime() - start; printf("pack: sum=%lu time=%g\n", (unsigned long)sum, elapsed); +#if 0 start= gettime(); sum= run_sum_getbits_nocond2(count, loops, b2); elapsed= gettime() - start; diff --git a/include/bitop.h b/include/bitop.h index fbbb419..085e348 100644 --- a/include/bitop.h +++ b/include/bitop.h @@ -45,3 +45,16 @@ #else #error No COUNT_LEADING_ZEROS_64 defined for compiler #endif + +/* + Non-temporal stores. + + These write directly to memory bypassing cache. +*/ + +#ifdef __GNUC__ +#define STORE_NON_TEMPORAL_64(p, v) asm("movnti %1, %0" : "=m" (*(p)) : "r" (v)) +// __builtin_ia32_movntq((long long unsigned int*)(p), (long long unsigned)(v)) +#else +#define STORE_NON_TEMPORAL_64(p, v) (*(uint64_t *)(p) = (uint64_t)(v)) +#endif -- 2.11.4.GIT