From f6ae00c9ac14a4be59aed51b31f09c8604ebc7a7 Mon Sep 17 00:00:00 2001
From: Kristian Nielsen <knmeister@gmail.com>
Date: Wed, 14 Jan 2009 23:10:30 +0100
Subject: [PATCH]  - New pack5() and unpack5() with reduced dependency chain,
 which is    really good (12.9 sec for 2G words at compression to 26.4%).  -
 Experiment with non-temporal stores for pack_overwriteptr.  - Some more
 testing.

---
 .gitignore            |   2 +-
 exp/packed.h          |  54 +++++++++++++++++-
 exp/quick_pack_test.c |  38 +++++++++++++
 exp/scan.cc           | 152 ++++++++++++++++++++++++++++++++++++++++++++++----
 include/bitop.h       |  13 +++++
 5 files changed, 247 insertions(+), 12 deletions(-)
 create mode 100644 exp/quick_pack_test.c
diff --git a/.gitignore b/.gitignore
index 0e6ffc3..96b8d78 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,6 +6,6 @@
 /exp/packed_test
 /exp/scan
 /exp/raw_mem_bw
-
+/exp/quick_pack_test
 /exp/back
 /foreign
diff --git a/exp/packed.h b/exp/packed.h
index 077b66e..99d9bf4 100644
--- a/exp/packed.h
+++ b/exp/packed.h
@@ -35,6 +35,13 @@
 */
 // #define PREFETCH_EXPERIMENT
 
+/*
+  An experiment with using non-temporal writes in pack_overwriteptr.
+
+  This avoid loading the memory being written into cache.
+*/
+#define NONTEMPORAL_EXPERIMENT
+
 class pack_ptr_base {
  private:
   /* Base pointer, never modified. */
@@ -209,6 +216,7 @@ class templ_pack_writeptr : public B {
   void pack2(uint64_t v);
   void pack3(uint64_t v);
   void pack4(uint64_t v);
+  void pack5(uint64_t v);
 };
 
 /*
@@ -225,6 +233,7 @@ class templ_pack_ptr : public templ_pack_writeptr<B> {
   uint64_t unpack2();
   uint64_t unpack3();
   uint64_t unpack4();
+  uint64_t unpack5();
 };
 
 typedef templ_pack_ptr<pack_ptr_base> pack_ptr;
@@ -353,6 +362,38 @@ templ_pack_ptr<B>::unpack4()
 #undef LOOKUP_UNPACK_GET_OR_VALUE
 #undef LOOKUP_UNPACK_GET_LSHIFT
 
+/*
+  Try hard to reduce the length of the dependency chain:
+   - Don't use mul/div, as they have high latency.
+   - Very simple computation of first part of value (3 ops) to avoid
+     initial stall.
+   - Compute second part independently from first for parallelism.
+
+  Method: 4 bits of header, 4, 8, 12, ..., 64 bits of data, total 8-68 bits.
+
+  This is significantly faster (maybe 33% or so) than pack[1-4].
+*/
+template <class B>
+inline void
+templ_pack_writeptr<B>::pack5(uint64_t v)
+{
+  uint64_t highest_bit;
+  asm( "bsrq %1, %0" : "=r" (highest_bit) : "r" (v|1) : "cc");
+  uint64_t bits1= highest_bit>>2;
+  uint64_t bits2= (highest_bit & 0x3c) + 4;
+  B::putbits(4, bits1);
+  B::putbits(bits2, v);
+}
+
+template <class B>
+inline uint64_t
+templ_pack_ptr<B>::unpack5()
+{
+  uint64_t n= B::getbits(4);
+  return B::getbits((n*4) + 4);
+}
+
+
 
 /*
   Class for stream writing bit-aligned values of arbitrary bit-size <= 64.
@@ -397,7 +438,12 @@ pack_overwriteptr_base::putbits(uint64_t numbits, uint64_t data)
   bitpos= old_bitpos + numbits;
   if (bitpos >= 64)
   {
+#ifdef NONTEMPORAL_EXPERIMENT
+    STORE_NON_TEMPORAL_64(base + i, v);
+    i++;
+#else
     base[i++]= v;
+#endif
     bitpos-= 64;
     if (likely(!(numbits == 64 && old_bitpos == 0)))
       v= data >> (64 - old_bitpos);
@@ -409,5 +455,11 @@ pack_overwriteptr_base::putbits(uint64_t numbits, uint64_t data)
 inline void
 pack_overwriteptr_base::flush()
 {
-  base[i]= v;
+  if (bitpos > 0) {
+#ifdef NONTEMPORAL_EXPERIMENT
+    STORE_NON_TEMPORAL_64(base + i, v);
+#else
+    base[i]= v;
+#endif
+  }
 }
diff --git a/exp/quick_pack_test.c b/exp/quick_pack_test.c
new file mode 100644
index 0000000..40dc337
--- /dev/null
+++ b/exp/quick_pack_test.c
@@ -0,0 +1,38 @@
+#include <stdint.h>
+#include <stdio.h>
+
+/*
+  Test using asm to get best instructions for count-significant-bits and
+  divide-by-multiply. Turns out that especially multiply has a high latency
+  (3-8 cycles depending apparently), so not optimal to use for
+  pack()/unpack().
+*/
+
+#define NOINLINE __attribute__((noinline))
+
+NOINLINE static uint64_t
+get_num_bits(uint64_t v) {
+  uint64_t v2= v|1;
+  uint64_t v3;
+  asm( "bsrq %1, %0" : "=r" (v3) : "r" (v2) : "cc");
+  uint64_t v4= v3 + 8;
+  register uint64_t v5 asm("rax") = v4;
+  register uint64_t v6 asm ("rdx") = 0x1c71c71d00000000;
+  asm( "mulq %0" : "=r" (v6), "=r" (v5) : "r" (v5), "r" (v6));
+  return v6;
+}
+
+int
+main(int argc, char *argv[]) {
+  uint64_t i;
+  for (i = 0; i <= 64; i++) {
+    uint64_t v;
+    if (i == 0) {
+      v = 0;
+    } else {
+      v = (uint64_t)1 << (i-1);
+    }
+    printf("%lu\t%16lx\t%lu\n", i, v, get_num_bits(v));
+  }
+  return 0;
+}
diff --git a/exp/scan.cc b/exp/scan.cc
index ca76b0e..affdf5d 100644
--- a/exp/scan.cc
+++ b/exp/scan.cc
@@ -21,6 +21,7 @@
 #include <stdio.h>
 #include <sys/time.h>
 #include <time.h>
+#include <string.h>
 
 #include "packed_mask.h"
 #include "packed_nocond.h"
@@ -132,6 +133,20 @@ gettime(void)
 }
 
 NOINLINE static uint64_t
+run_sum(uint count, uint loops, uint64_t *base)
+{
+  uint64_t sum= 0;
+  for (uint j=  loops; j > 0; j--)
+  {
+    for (uint i= 0; i < count; i++)
+    {
+      sum+= base[i];
+    }
+  }
+  return sum;
+}
+
+NOINLINE static uint64_t
 run_sum_del(uint count, uint loops, uint64_t *base)
 {
   uint64_t sum= 0;
@@ -236,6 +251,21 @@ run_sum_packed(uint count, uint loops, uint64_t *base)
   return sum;
 }
 
+NOINLINE static uint64_t
+run_sum_packed5(uint count, uint loops, uint64_t *base)
+{
+  uint64_t sum= 0;
+  for (uint j= loops; j > 0; j--)
+  {
+    pack_ptr p(base);
+    for (uint i= count; i > 0; i--)
+    {
+      sum+= p.unpack5();
+    }
+  }
+  return sum;
+}
+
 NOINLINE static void
 run_copy(uint count, uint loops, uint64_t *dst, uint64_t *src)
 {
@@ -252,6 +282,62 @@ run_copy(uint count, uint loops, uint64_t *dst, uint64_t *src)
   }
 }
 
+#include <xmmintrin.h>
+void _mm_stream_pi(__m64 *p, __m64 a);
+
+NOINLINE static void
+run_copy_non_temporal(uint count, uint loops, uint64_t *dst, uint64_t *src)
+{
+  if ((count & 7) != 0) {
+    fprintf(stderr, "run_copy_non_temporal(): Count must be divisible by 8 "
+            "for this test, but %u isn't.\n", count);
+    return;
+  }
+  count /= 8;
+
+  for (uint j= loops; j > 0; j--)
+  {
+    __m64 *mm_dst= (__m64 *)dst;
+    for (uint i= 0; i < count; i++)
+    {
+      uint src_i= i & 511;
+      _mm_stream_pi(mm_dst, (__m64)(src[src_i]));
+      _mm_stream_pi(mm_dst+1, (__m64)(src[src_i+1]));
+      _mm_stream_pi(mm_dst+2, (__m64)(src[src_i+2]));
+      _mm_stream_pi(mm_dst+3, (__m64)(src[src_i+3]));
+      _mm_stream_pi(mm_dst+4, (__m64)(src[src_i+4]));
+      _mm_stream_pi(mm_dst+5, (__m64)(src[src_i+5]));
+      _mm_stream_pi(mm_dst+6, (__m64)(src[src_i+6]));
+      _mm_stream_pi(mm_dst+7, (__m64)(src[src_i+7]));
+      mm_dst += 8;
+    }
+  }
+}
+
+NOINLINE static void
+run_copy_non_temporal_no_unroll(uint count, uint loops, uint64_t *dst, uint64_t *src)
+{
+  if ((count & 7) != 0) {
+    fprintf(stderr, "run_copy_non_temporal(): Count must be divisible by 8 "
+            "for this test, but %u isn't.\n", count);
+    return;
+  }
+  count /= 8;
+
+  for (uint j= loops; j > 0; j--)
+  {
+    __m64 *mm_dst= (__m64 *)dst;
+    for (uint i= 0; i < count; i++)
+    {
+      uint src_i= i & 511;
+      for (uint j= 0; j < 8; j++) {
+        _mm_stream_pi(mm_dst+j, (__m64)(src[src_i+j]));
+      }
+      mm_dst += 8;
+    }
+  }
+}
+
 NOINLINE static void
 run_pack1(uint count, uint loops, uint64_t *dst, uint64_t *src)
 {
@@ -308,6 +394,20 @@ run_pack4(uint count, uint loops, uint64_t *dst, uint64_t *src)
   }
 }
 
+NOINLINE static void
+run_pack5(uint count, uint loops, uint64_t *dst, uint64_t *src)
+{
+  for (uint j= loops; j > 0; j--)
+  {
+    pack_overwriteptr p(dst);
+    for (uint i= 0; i < count; i++)
+    {
+      p.pack5(src[i&511]);
+    }
+    p.flush();
+  }
+}
+
 int
 main(int argc, char *argv[])
 {
@@ -332,16 +432,46 @@ main(int argc, char *argv[])
   fill_expanded(b1, count);
   printf("Expanded: %ld bytes\n", (long)(count*sizeof(*b1)));
 
+  /* Fill in some prior data to check correctness on non-zero destination. */
+  memcpy(b2, b1, count*sizeof(*b1));
   pack_overwriteptr p(b2);
   fill_packed(p, count);
   printf("Packed: %ld bytes\n", (long)(p.used()*sizeof(*b2)));
 
+  /* Check correctness of pack5() and unpack5(). */
+  memcpy(b3, b1, count*sizeof(*b1));
+  pack_overwriteptr p5a(b3);
+  for (uint i= 0; i < count; i++) {
+    p5a.pack5(b1[i]);
+  }
+  p5a.flush();
+  printf("Packed: %ld bytes\n", (long)(p5a.used()*sizeof(*b2)));
+  pack_ptr p5b(b3);
+  for (uint i= 0; i < count; i++) {
+    uint64_t v= p5b.unpack5();
+    if (v != b1[i]) {
+      printf("DIFF! i=%u exp=%lu (0x%lx) pack5=%lu (0x%lx)\n",
+             i, b1[i], b1[i], v, v);
+      exit(1);
+    }
+  }
+
+  uint64_t sum;
+
   /* Do in a loop to time packing. */
   double start1= gettime();
   run_copy(count, loops, b3, b1);
   double elapsed1= gettime() - start1;
   printf("copy: time=%g\n", elapsed1);
   start1= gettime();
+  run_copy_non_temporal(count, loops, b3, b1);
+  elapsed1= gettime() - start1;
+  printf("copy_non_temporal: time=%g\n", elapsed1);
+  start1= gettime();
+  run_copy_non_temporal_no_unroll(count, loops, b3, b1);
+  elapsed1= gettime() - start1;
+  printf("copy_non_temporal_no_unroll: time=%g\n", elapsed1);
+  start1= gettime();
   run_pack1(count, loops, b3, b1);
   elapsed1= gettime() - start1;
   printf("pack1: time=%g\n", elapsed1);
@@ -357,6 +487,14 @@ main(int argc, char *argv[])
   run_pack4(count, loops, b3, b1);
   elapsed1= gettime() - start1;
   printf("pack4: time=%g\n", elapsed1);
+  start1= gettime();
+  run_pack5(count, loops, b3, b1);
+  elapsed1= gettime() - start1;
+  printf("pack5: time=%g\n", elapsed1);
+  start1= gettime();
+  sum= run_sum_packed5(count, loops, b3);
+  elapsed1= gettime() - start1;
+  printf("exp: unpack5() sum=%lu time=%g\n", (unsigned long)sum, elapsed1);
 
   pack_ptr_del p0(b3);
   fill_packed_del(p0, count);
@@ -424,20 +562,14 @@ main(int argc, char *argv[])
   }
 
   /* Time looping over data. */
-  uint64_t sum= 0;
+  sum= 0;
   double start= gettime();
   double elapsed;
-#if 0
-  for (uint j= 0; j < loops; j++)
-  {
-    for (uint i= 0; i < count; i++)
-    {
-      sum+= b1[i];
-    }
-  }
+  sum= run_sum(count, loops, b1);
   elapsed= gettime() - start;
   printf("exp: sum=%lu time=%g\n", (unsigned long)sum, elapsed);
 
+#if 0
   uint32_t sum32= 0;
   start= gettime();
   for (uint j= 0; j < loops; j++)
@@ -505,12 +637,12 @@ main(int argc, char *argv[])
   elapsed= gettime() - start;
   printf("getbits_unsigned(33): sum=%lu time=%g\n", (unsigned long)sum, elapsed);
 
-#if 0
   start= gettime();
   sum= run_sum_packed(count, loops, b2);
   elapsed= gettime() - start;
   printf("pack: sum=%lu time=%g\n", (unsigned long)sum, elapsed);
 
+#if 0
   start= gettime();
   sum= run_sum_getbits_nocond2(count, loops, b2);
   elapsed= gettime() - start;
diff --git a/include/bitop.h b/include/bitop.h
index fbbb419..085e348 100644
--- a/include/bitop.h
+++ b/include/bitop.h
@@ -45,3 +45,16 @@
 #else
 #error No COUNT_LEADING_ZEROS_64 defined for compiler
 #endif
+
+/*
+  Non-temporal stores.
+
+  These write directly to memory bypassing cache.
+*/
+
+#ifdef __GNUC__
+#define STORE_NON_TEMPORAL_64(p, v) asm("movnti %1, %0" : "=m" (*(p)) : "r" (v))
+//  __builtin_ia32_movntq((long long unsigned int*)(p), (long long unsigned)(v))
+#else
+#define STORE_NON_TEMPORAL_64(p, v) (*(uint64_t *)(p) = (uint64_t)(v))
+#endif
-- 
2.11.4.GIT