exp/packed.h

   1 /*
   2   Copyright 2008 Kristian Nielsen
   3
   4   This file is part of BeeDB.
   5
   6   Foobar is free software: you can redistribute it and/or modify
   7   it under the terms of the GNU General Public License as published by
   8   the Free Software Foundation, either version 2 of the License, or
   9   (at your option) any later version.
  10
  11   Foobar is distributed in the hope that it will be useful,
  12   but WITHOUT ANY WARRANTY; without even the implied warranty of
  13   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14   GNU General Public License for more details.
  15
  16   You should have received a copy of the GNU General Public License
  17   along with Foobar.  If not, see <http://www.gnu.org/licenses/>.
  18 */
  19
  20 /*
  21   Try for a version which shifts the cached 64-bit word in-place, making
  22   available the result word without having to compute a shift first,
  23   hopefully reducing the length of the dependency chain.
  24
  25   Also use uint64_t throughout, seems to sometimes save the odd instruction
  26   otherwise needed for sign extension.
  27 */
  28
  29 #include "bitop.h"
  30
  31 /*
  32   This was an experiment to pre-fetch one 64-bit word ahead.
  33   However, in addition to a nasty requirement to keep 8 bytes of extra readable
  34   memory at the end of data, it also turns out to be about 3% slower!
  35 */
  36 // #define PREFETCH_EXPERIMENT
  37
  38 /*
  39   An experiment with using non-temporal writes in pack_overwriteptr.
  40
  41   This avoid loading the memory being written into cache.
  42 */
  43 //#define NONTEMPORAL_EXPERIMENT
  44
  45 class pack_ptr_base {
  46  private:
  47   /* Base pointer, never modified. */
  48   uint64_t * const base;
  49   /* Current value, all but lower BITSLEFT bits are zero. */
  50   uint64_t v;
  51 #ifdef PREFETCH_EXPERIMENT
  52   uint64_t v_next;
  53 #endif
  54   /* Index into BASE of next word to fetch. */
  55   uint64_t i;
  56   /* Number of bits left in current word V. */
  57   uint64_t bitsleft;
  58
  59  public:
  60   pack_ptr_base(uint64_t *_base);
  61   /*
  62     This constructor sets up to start reading from _bit_ position relative to
  63     BASE. Ie. we might pass start_bitpos=100 to skip the first 5 20-bit words.
  64   */
  65   pack_ptr_base(uint64_t *_base, uint64_t start_bitpos);
  66
  67   void seek(uint64_t bitpos);
  68   uint64_t getbits(uint64_t numbits);
  69   /*
  70     This method needs to load + store the value at every invocation, due to
  71     possibility of seek()/getbits() intermixed with putbits().
  72
  73     For more efficient multiple putbits() in serial, use pack_writeptr or
  74     pack_overwriteptr.
  75   */
  76   void putbits(uint64_t numbits, uint64_t data);
  77   /* More efficient versions for when numbits==64. */
  78   uint64_t getbits64();
  79   void putbits64(uint64_t data);
  80
  81   uint64_t used_words() { return i; }
  82   uint64_t used_bits() { return (i<<6) - bitsleft; }
  83 };
  84
  85
  86 inline
  87 pack_ptr_base::pack_ptr_base(uint64_t *_base) :
  88   base(_base), v(0), i(0), bitsleft(0)
  89 {
  90 #ifdef PREFETCH_EXPERIMENT
  91   v_next= base[i++];
  92 #endif
  93   /*
  94     We do not pre-load the first word, as we use the invariant (bitsleft < 64)
  95     to avoid a special case for undefined shift-by-64-bit.
  96   */
  97 }
  98
  99 inline
 100 pack_ptr_base::pack_ptr_base(uint64_t *_base, uint64_t start_bitpos) :
 101   base(_base)
 102 {
 103   seek(start_bitpos);
 104 }
 105
 106 inline void
 107 pack_ptr_base::seek(uint64_t bitpos)
 108 {
 109   /*
 110     Since we cannot have bitsleft==64, in the case where we start on a word
 111     boundary we need to point to the previous word and set bitsleft=0.
 112   */
 113   uint64_t fraction= bitpos & 63;
 114   i= bitpos >> 6;
 115   if (likely(fraction > 0))
 116   {
 117     bitsleft= 64 - fraction;
 118     v= base[i++] >> fraction;
 119   }
 120   else
 121   {
 122     bitsleft= 0;
 123     v= 0;
 124   }
 125 #ifdef PREFETCH_EXPERIMENT
 126   v_next= base[i++];
 127 #endif
 128 }
 129
 130 inline uint64_t
 131 pack_ptr_base::getbits(uint64_t numbits)
 132 {
 133   if (bitsleft >= numbits)
 134   {
 135     uint64_t r= v & (~(uint64_t)0 >> (64 - numbits));
 136     /*
 137       The value of bitsleft is at most 63, so for numbits==64 we never get
 138       here with an otherwise undefined shift-by-64-bit operation.
 139     */
 140     v>>= numbits;
 141     bitsleft-= numbits;
 142     return r;
 143   }
 144   else
 145   {
 146     uint64_t r= v;
 147 #ifdef PREFETCH_EXPERIMENT
 148     v= v_next;
 149     v_next= base[i++];
 150 #else
 151     v= base[i++];
 152 #endif
 153     r= (r | (v << bitsleft)) & (~(uint64_t)0 >> (64 - numbits));
 154     /*
 155       Guard against undefined shift-by-64-bit.
 156       In many cases the numbits==64 condition will be statically determined
 157       by the compiler, so no branch will be generated, or in any case the CPU
 158       should be safe to predict this conditional as true.
 159     */
 160     if (likely(!(numbits == 64 && bitsleft == 0)))
 161       v>>= numbits - bitsleft;
 162     else
 163       v= 0;
 164     bitsleft= bitsleft + (64 - numbits);
 165     return r;
 166   }
 167 }
 168
 169 inline uint64_t
 170 pack_ptr_base::getbits64()
 171 {
 172   uint64_t r= v;
 173 #ifdef PREFETCH_EXPERIMENT
 174   v= v_next;
 175   v_next= base[i++];
 176 #else
 177   v= base[i++];
 178 #endif
 179   r|= v << bitsleft;
 180   /* Guard against undefined shift-by-64-bit. */
 181   if (bitsleft != 0)
 182     v>>= 64 - bitsleft;
 183   else
 184     v= 0;
 185   return r;
 186 }
 187
 188 inline void
 189 pack_ptr_base::putbits(uint64_t numbits, uint64_t data)
 190 {
 191   if (bitsleft > 0)
 192   {
 193     uint64_t val= base[i-1];
 194     uint64_t mask= (~(uint64_t)0 >> (64 - numbits)) << (64 - bitsleft);
 195     val= (val & ~mask) | (data << (64 - bitsleft));
 196     if (bitsleft >= numbits)
 197     {
 198       bitsleft-= numbits;
 199       v>>= numbits;
 200     }
 201     else
 202     {
 203       uint64_t rest= numbits - bitsleft;
 204       bitsleft+= 64 - numbits;
 205       mask= ~(uint64_t)0 >> bitsleft;
 206       v= (base[i] & ~mask) | (data >> rest);
 207       base[i++]= v;
 208       v>>= rest;
 209     }
 210   }
 211   else
 212   {
 213     uint64_t mask= ~(uint64_t)0 >> (64 - numbits);
 214     v= (base[i] & ~mask) | data;
 215     base[i++]= v;
 216     if (numbits != 64)
 217       v>>= numbits;
 218     else
 219       v= 0;
 220     bitsleft= 64 - numbits;
 221   }
 222 }
 223
 224 inline void
 225 pack_ptr_base::putbits64(uint64_t data)
 226 {
 227   if (bitsleft > 0)
 228   {
 229     uint64_t val= base[i-1];
 230     uint64_t mask= ~(uint64_t)0 << (64 - bitsleft);
 231     val= (val & ~mask) | (data << (64 - bitsleft));
 232     uint64_t rest= 64 - bitsleft;
 233     mask= ~(uint64_t)0 >> bitsleft;
 234     v= (base[i] & ~mask) | (data >> rest);
 235     base[i++]= v;
 236     v>>= rest;
 237   }
 238   else
 239   {
 240     base[i++]= data;
 241     v= 0;
 242   }
 243 }
 244
 245
 246 /*
 247   This template adds pack() methods to a base class that supports putbits().
 248 */
 249 template <class B>
 250 class templ_pack_writeptr : public B {
 251  public:
 252   templ_pack_writeptr(uint64_t *base) : B(base) { }
 253   /*
 254     This constructor with specified start position will only be available in
 255     some instantiations.
 256   */
 257   templ_pack_writeptr(uint64_t *base, uint64_t start_bitpos) :
 258     B(base, start_bitpos) { }
 259
 260   void pack1(uint64_t v);
 261   void pack2(uint64_t v);
 262   void pack3(uint64_t v);
 263   void pack4(uint64_t v);
 264   void pack5(uint64_t v);
 265
 266   /*
 267     These are used to efficiently pack/unpack signed numbers.
 268     They convert to a representation where numbers of small magnitude (both
 269     negative and positive numbers) use few significant bits (most leading bits
 270     are zero).
 271   */
 272   static uint64_t signed_encode(int64_t i);
 273   static int64_t signed_decode(uint64_t u);
 274 };
 275
 276 /*
 277   This template adds both pack() and unpack() to a base class that supports
 278   both putbits() and getbits().
 279 */
 280 template <class B>
 281 class templ_pack_ptr : public templ_pack_writeptr<B> {
 282  public:
 283   templ_pack_ptr(uint64_t *base) : templ_pack_writeptr<B>(base) { }
 284   templ_pack_ptr(uint64_t *base, uint64_t start_bitpos) :
 285     templ_pack_writeptr<B>(base, start_bitpos) { }
 286   uint64_t unpack1();
 287   uint64_t unpack2();
 288   uint64_t unpack3();
 289   uint64_t unpack4();
 290   uint64_t unpack5();
 291 };
 292
 293 typedef templ_pack_ptr<pack_ptr_base> pack_ptr;
 294
 295
 296 template <class B>
 297 inline void
 298 templ_pack_writeptr<B>::pack1(uint64_t v)
 299 {
 300   /*
 301     Compute the 3-bit header counter C, where we store C*9+1 bits.
 302     We have C = ((#bits - 1) + 8) / 9   [rounding down].
 303     And #bits = 64 - #leading zeroes, and 64 - 1 + 8 == 71.
 304     We use |1 to be sure to get at least a size of one bit (__builtin_clzl is
 305     undefined for argument of zero).
 306   */
 307   uint64_t bits= (71 - COUNT_LEADING_ZEROS_64(v | 1))/9;
 308   B::putbits(3, bits);
 309   B::putbits(bits*9+1, v);
 310 }
 311
 312 template <class B>
 313 inline uint64_t
 314 templ_pack_ptr<B>::unpack1()
 315 {
 316   uint64_t n= B::getbits(3);
 317   return B::getbits(n*9+1);
 318 }
 319
 320 template <class B>
 321 inline void
 322 templ_pack_writeptr<B>::pack2(uint64_t v)
 323 {
 324   uint64_t bits= (79 - COUNT_LEADING_ZEROS_64(v | 1))/20;
 325   B::putbits(2, bits);
 326   B::putbits(bits*20+4, v);
 327 }
 328
 329 template <class B>
 330 inline uint64_t
 331 templ_pack_ptr<B>::unpack2()
 332 {
 333   uint64_t n= B::getbits(2);
 334   return B::getbits(n*20+4);
 335 }
 336
 337 #define LOOKUP_PACK_GET_TAG_BITS(v)   ( (v) & 0xffff )
 338 #define LOOKUP_PACK_GET_TAG_VALUE(v)  ( (v) >> 24 )
 339 #define LOOKUP_PACK_GET_DATA_BITS(v)  ( ((v) >> 16) & 0xff )
 340
 341 #define LOOKUP_UNPACK_GET_BITS(v)     ( (v) & 0xffff )
 342 #define LOOKUP_UNPACK_GET_OR_VALUE(v) ( (v) >> 24 )
 343 #define LOOKUP_UNPACK_GET_LSHIFT(v)   ( ((v) >> 16) & 0xff )
 344
 345 extern uint32_t lookup_table_pack3[];
 346 template <class B>
 347 inline void
 348 templ_pack_writeptr<B>::pack3(uint64_t v)
 349 {
 350   uint bits= COUNT_LEADING_ZEROS_64(v | 1);
 351   uint32_t lookup_word= lookup_table_pack3[bits];
 352   uint64_t tag_bits= LOOKUP_PACK_GET_TAG_BITS(lookup_word);
 353   uint64_t tag_value= LOOKUP_PACK_GET_TAG_VALUE(lookup_word);
 354   uint64_t data_bits= LOOKUP_PACK_GET_DATA_BITS(lookup_word);
 355   B::putbits(tag_bits, tag_value);
 356   B::putbits(data_bits, v);
 357 }
 358
 359 extern uint32_t lookup_table_unpack3[];
 360 template <class B>
 361 inline uint64_t
 362 templ_pack_ptr<B>::unpack3()
 363 {
 364   uint prefix= B::getbits(3);
 365   uint32_t lookup_word = lookup_table_unpack3[prefix];
 366   uint64_t bits= LOOKUP_UNPACK_GET_BITS(lookup_word);
 367   uint64_t or_value= LOOKUP_UNPACK_GET_OR_VALUE(lookup_word);
 368 #ifndef WORDS_BIGENDIAN
 369   uint64_t lshift= LOOKUP_UNPACK_GET_LSHIFT(lookup_word);
 370 #endif
 371   return ( B::getbits(bits)
 372 #ifndef WORDS_BIGENDIAN
 373            << lshift
 374 #endif
 375          ) | or_value;
 376 }
 377
 378 extern uint32_t lookup_table_pack4[];
 379 template <class B>
 380 inline void
 381 templ_pack_writeptr<B>::pack4(uint64_t v)
 382 {
 383   uint bits= COUNT_LEADING_ZEROS_64(v | 1);
 384   uint32_t lookup_word= lookup_table_pack4[bits];
 385   uint64_t tag_bits= LOOKUP_PACK_GET_TAG_BITS(lookup_word);
 386   uint64_t tag_value= LOOKUP_PACK_GET_TAG_VALUE(lookup_word);
 387   uint64_t data_bits= LOOKUP_PACK_GET_DATA_BITS(lookup_word);
 388   B::putbits(tag_bits, tag_value);
 389   B::putbits(data_bits, v);
 390 }
 391
 392 extern uint32_t lookup_table_unpack4[];
 393 template <class B>
 394 inline uint64_t
 395 templ_pack_ptr<B>::unpack4()
 396 {
 397   uint prefix= B::getbits(3);
 398   uint32_t lookup_word = lookup_table_unpack4[prefix];
 399   uint64_t bits= LOOKUP_UNPACK_GET_BITS(lookup_word);
 400   uint64_t or_value= LOOKUP_UNPACK_GET_OR_VALUE(lookup_word);
 401 #ifndef WORDS_BIGENDIAN
 402   uint64_t lshift= LOOKUP_UNPACK_GET_LSHIFT(lookup_word);
 403 #endif
 404   return ( B::getbits(bits)
 405 #ifndef WORDS_BIGENDIAN
 406            << lshift
 407 #endif
 408          ) | or_value;
 409 }
 410
 411 #undef LOOKUP_PACK_GET_TAG_BITS
 412 #undef LOOKUP_PACK_GET_TAG_VALUE
 413 #undef LOOKUP_PACK_GET_DATA_BITS
 414
 415 #undef LOOKUP_UNPACK_GET_BITS
 416 #undef LOOKUP_UNPACK_GET_OR_VALUE
 417 #undef LOOKUP_UNPACK_GET_LSHIFT
 418
 419 /*
 420   Try hard to reduce the length of the dependency chain:
 421    - Don't use mul/div, as they have high latency.
 422    - Very simple computation of first part of value (3 ops) to avoid
 423      initial stall.
 424    - Compute second part independently from first for parallelism.
 425
 426   Method: 4 bits of header, 4, 8, 12, ..., 64 bits of data, total 8-68 bits.
 427
 428   This is significantly faster (maybe 33% or so) than pack[1-4].
 429 */
 430 template <class B>
 431 inline void
 432 templ_pack_writeptr<B>::pack5(uint64_t v)
 433 {
 434   uint64_t highest_bit;
 435   asm( "bsrq %1, %0" : "=r" (highest_bit) : "r" (v|1) : "cc");
 436   uint64_t bits1= highest_bit>>2;
 437   uint64_t bits2= (highest_bit & 0x3c) + 4;
 438   B::putbits(4, bits1);
 439   B::putbits(bits2, v);
 440 }
 441
 442 template <class B>
 443 inline uint64_t
 444 templ_pack_ptr<B>::unpack5()
 445 {
 446   uint64_t n= B::getbits(4);
 447   return B::getbits((n*4) + 4);
 448 }
 449
 450 /*
 451   Encoding signed numbers as unsigned for pack/unpack, with small number of
 452   significant digits for numbers of small magnitude.
 453
 454   Method is to store the sign bit in bit 0, and in bits 1-63 store x (for
 455   non-negative x) respectively -(x+1) (for negative x).
 456
 457   The asymmetry allows representing _all_ numbers exactly and avoids
 458   representing -0.
 459
 460   Binary encoding of 0, 1, 2, 3: 000, 010, 100, 110
 461   Binary encoding of -1, -2, -3: 001, 011, 101
 462   Decoded values by increasing encoding: 0, -1, 1, -2, 2, -3, 3, ...
 463
 464   The encoding of signed i as unsigned u and vise versa can be descripted thus:
 465
 466     u = i >= 0 ? (i << 1) : (1 | ((-(i+1))<<1))
 467     i = (u & 1) ? (-(u>>1) - 1) : (u>>1)
 468
 469   However, the implementation uses clever bit manipulations to get the same
 470   results with fewer operations and no conditionals. The dependency chain
 471   length for encode is only 4, and for decode 3 operations.
 472
 473   The main idea is that for signed number i, (i>>63) is either 0 or
 474   0xffffffffffffffff depending on sign; and (i^0xffffffffffffffff) = -(i+1).
 475
 476   ToDo: Hm, signed shift right is actually implementation dependent in
 477   C/C++ :-(. So need some port/*.h to replace (i>>63) with -(i < 0).
 478 */
 479 template <class B>
 480 inline uint64_t
 481 templ_pack_writeptr<B>::signed_encode(int64_t i)
 482 {
 483   return (((uint64_t)i ^ (uint64_t)(i>>63)) << 1) | (uint64_t)i >> 63;
 484 }
 485
 486 template <class B>
 487 inline int64_t
 488 templ_pack_writeptr<B>::signed_decode(uint64_t u)
 489 {
 490   return (int64_t) ( (u >> 1) ^ (uint64_t)((int64_t)(u<<63) >> 63) );
 491 }
 492
 493
 494
 495 /*
 496   Class for stream writing bit-aligned values of arbitrary bit-size <= 64.
 497   This class is optimized for writing new data sequentially, overwriting
 498   any previous values. This way it does not need to to any loads from the
 499   destination buffer at all.
 500 */
 501 class pack_overwriteptr_base {
 502  private:
 503   uint64_t *const base;
 504   uint64_t v;
 505   uint64_t i;
 506   uint64_t bitpos;
 507
 508  public:
 509   pack_overwriteptr_base(uint64_t *_base);
 510   pack_overwriteptr_base(uint64_t *_base, uint64_t start_bitpos);
 511
 512   /*
 513     Write bits into buffer.
 514     It is necessary to call flush() after the last write to ensure that any
 515     final fractional 64-bit destination word is stored into buffer. This way
 516     the putbits() call is faster since it does not need to execute a store
 517     at every invocation.
 518   */
 519   void putbits(uint64_t numbits, uint64_t data);
 520   /* Special version to optimize writing a full 64-bit word. */
 521   void putbits64(uint64_t data);
 522   void flush();
 523   uint64_t used_words() { return i + (bitpos > 0); }
 524   uint64_t used_bits() { return (i<<6) + bitpos; }
 525 };
 526
 527 typedef templ_pack_writeptr<pack_overwriteptr_base> pack_overwriteptr;
 528
 529 inline
 530 pack_overwriteptr_base::pack_overwriteptr_base(uint64_t *_base) :
 531   base(_base), v(0), i(0), bitpos(0)
 532 {
 533 }
 534
 535 inline
 536 pack_overwriteptr_base::pack_overwriteptr_base(uint64_t *_base,
 537                                                uint64_t start_bitpos) :
 538   base(_base), bitpos(start_bitpos & 0x3f)
 539 {
 540   i= start_bitpos >> 6;
 541   if (bitpos > 0)
 542   {
 543     v= base[i] & (~(uint64_t)0 >> (64 - bitpos));
 544   }
 545   else
 546   {
 547     v= 0;
 548   }
 549 }
 550
 551 inline void
 552 pack_overwriteptr_base::putbits(uint64_t numbits, uint64_t data)
 553 {
 554   uint64_t old_bitpos= bitpos;
 555   v|= data << old_bitpos;
 556   bitpos= old_bitpos + numbits;
 557   if (bitpos >= 64)
 558   {
 559 #ifdef NONTEMPORAL_EXPERIMENT
 560     STORE_NON_TEMPORAL_64(base + i, v);
 561     i++;
 562 #else
 563     base[i++]= v;
 564 #endif
 565     bitpos-= 64;
 566     if (likely(!(numbits == 64 && old_bitpos == 0)))
 567       v= data >> (64 - old_bitpos);
 568     else
 569       v= 0;
 570   }
 571 }
 572
 573 inline void
 574 pack_overwriteptr_base::putbits64(uint64_t data)
 575 {
 576   v|= data << bitpos;
 577 #ifdef NONTEMPORAL_EXPERIMENT
 578   STORE_NON_TEMPORAL_64(base + i, v);
 579   i++;
 580 #else
 581   base[i++]= v;
 582 #endif
 583   if (bitpos != 0)
 584     v= data >> (64 - bitpos);
 585   else
 586     v= 0;
 587 }
 588
 589 inline void
 590 pack_overwriteptr_base::flush()
 591 {
 592   if (bitpos > 0) {
 593 #ifdef NONTEMPORAL_EXPERIMENT
 594     STORE_NON_TEMPORAL_64(base + i, v);
 595 #else
 596     base[i]= v;
 597 #endif
 598   }
 599 }