First draft of performance test SQL schema.
[beedb.git] / exp / packed.h
blobc654a09445e0fcc67baf7b6f63e563557de1c312
1 /*
2 Copyright 2008 Kristian Nielsen
4 This file is part of BeeDB.
6 Foobar is free software: you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation, either version 2 of the License, or
9 (at your option) any later version.
11 Foobar is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with Foobar. If not, see <http://www.gnu.org/licenses/>.
21 Try for a version which shifts the cached 64-bit word in-place, making
22 available the result word without having to compute a shift first,
23 hopefully reducing the length of the dependency chain.
25 Also use uint64_t throughout, seems to sometimes save the odd instruction
26 otherwise needed for sign extension.
29 #include "bitop.h"
32 This was an experiment to pre-fetch one 64-bit word ahead.
33 However, in addition to a nasty requirement to keep 8 bytes of extra readable
34 memory at the end of data, it also turns out to be about 3% slower!
36 // #define PREFETCH_EXPERIMENT
39 An experiment with using non-temporal writes in pack_overwriteptr.
41 This avoid loading the memory being written into cache.
43 //#define NONTEMPORAL_EXPERIMENT
45 class pack_ptr_base {
46 private:
47 /* Base pointer, never modified. */
48 uint64_t * const base;
49 /* Current value, all but lower BITSLEFT bits are zero. */
50 uint64_t v;
51 #ifdef PREFETCH_EXPERIMENT
52 uint64_t v_next;
53 #endif
54 /* Index into BASE of next word to fetch. */
55 uint64_t i;
56 /* Number of bits left in current word V. */
57 uint64_t bitsleft;
59 public:
60 pack_ptr_base(uint64_t *_base);
62 This constructor sets up to start reading from _bit_ position relative to
63 BASE. Ie. we might pass start_bitpos=100 to skip the first 5 20-bit words.
65 pack_ptr_base(uint64_t *_base, uint64_t start_bitpos);
67 void seek(uint64_t bitpos);
68 uint64_t getbits(uint64_t numbits);
70 This method needs to load + store the value at every invocation, due to
71 possibility of seek()/getbits() intermixed with putbits().
73 For more efficient multiple putbits() in serial, use pack_writeptr or
74 pack_overwriteptr.
76 void putbits(uint64_t numbits, uint64_t data);
77 /* More efficient versions for when numbits==64. */
78 uint64_t getbits64();
79 void putbits64(uint64_t data);
81 uint64_t used_words() { return i; }
82 uint64_t used_bits() { return (i<<6) - bitsleft; }
86 inline
87 pack_ptr_base::pack_ptr_base(uint64_t *_base) :
88 base(_base), v(0), i(0), bitsleft(0)
90 #ifdef PREFETCH_EXPERIMENT
91 v_next= base[i++];
92 #endif
94 We do not pre-load the first word, as we use the invariant (bitsleft < 64)
95 to avoid a special case for undefined shift-by-64-bit.
99 inline
100 pack_ptr_base::pack_ptr_base(uint64_t *_base, uint64_t start_bitpos) :
101 base(_base)
103 seek(start_bitpos);
106 inline void
107 pack_ptr_base::seek(uint64_t bitpos)
110 Since we cannot have bitsleft==64, in the case where we start on a word
111 boundary we need to point to the previous word and set bitsleft=0.
113 uint64_t fraction= bitpos & 63;
114 i= bitpos >> 6;
115 if (likely(fraction > 0))
117 bitsleft= 64 - fraction;
118 v= base[i++] >> fraction;
120 else
122 bitsleft= 0;
123 v= 0;
125 #ifdef PREFETCH_EXPERIMENT
126 v_next= base[i++];
127 #endif
130 inline uint64_t
131 pack_ptr_base::getbits(uint64_t numbits)
133 if (bitsleft >= numbits)
135 uint64_t r= v & (~(uint64_t)0 >> (64 - numbits));
137 The value of bitsleft is at most 63, so for numbits==64 we never get
138 here with an otherwise undefined shift-by-64-bit operation.
140 v>>= numbits;
141 bitsleft-= numbits;
142 return r;
144 else
146 uint64_t r= v;
147 #ifdef PREFETCH_EXPERIMENT
148 v= v_next;
149 v_next= base[i++];
150 #else
151 v= base[i++];
152 #endif
153 r= (r | (v << bitsleft)) & (~(uint64_t)0 >> (64 - numbits));
155 Guard against undefined shift-by-64-bit.
156 In many cases the numbits==64 condition will be statically determined
157 by the compiler, so no branch will be generated, or in any case the CPU
158 should be safe to predict this conditional as true.
160 if (likely(!(numbits == 64 && bitsleft == 0)))
161 v>>= numbits - bitsleft;
162 else
163 v= 0;
164 bitsleft= bitsleft + (64 - numbits);
165 return r;
169 inline uint64_t
170 pack_ptr_base::getbits64()
172 uint64_t r= v;
173 #ifdef PREFETCH_EXPERIMENT
174 v= v_next;
175 v_next= base[i++];
176 #else
177 v= base[i++];
178 #endif
179 r|= v << bitsleft;
180 /* Guard against undefined shift-by-64-bit. */
181 if (bitsleft != 0)
182 v>>= 64 - bitsleft;
183 else
184 v= 0;
185 return r;
188 inline void
189 pack_ptr_base::putbits(uint64_t numbits, uint64_t data)
191 if (bitsleft > 0)
193 uint64_t val= base[i-1];
194 uint64_t mask= (~(uint64_t)0 >> (64 - numbits)) << (64 - bitsleft);
195 val= (val & ~mask) | (data << (64 - bitsleft));
196 if (bitsleft >= numbits)
198 bitsleft-= numbits;
199 v>>= numbits;
201 else
203 uint64_t rest= numbits - bitsleft;
204 bitsleft+= 64 - numbits;
205 mask= ~(uint64_t)0 >> bitsleft;
206 v= (base[i] & ~mask) | (data >> rest);
207 base[i++]= v;
208 v>>= rest;
211 else
213 uint64_t mask= ~(uint64_t)0 >> (64 - numbits);
214 v= (base[i] & ~mask) | data;
215 base[i++]= v;
216 if (numbits != 64)
217 v>>= numbits;
218 else
219 v= 0;
220 bitsleft= 64 - numbits;
224 inline void
225 pack_ptr_base::putbits64(uint64_t data)
227 if (bitsleft > 0)
229 uint64_t val= base[i-1];
230 uint64_t mask= ~(uint64_t)0 << (64 - bitsleft);
231 val= (val & ~mask) | (data << (64 - bitsleft));
232 uint64_t rest= 64 - bitsleft;
233 mask= ~(uint64_t)0 >> bitsleft;
234 v= (base[i] & ~mask) | (data >> rest);
235 base[i++]= v;
236 v>>= rest;
238 else
240 base[i++]= data;
241 v= 0;
247 This template adds pack() methods to a base class that supports putbits().
249 template <class B>
250 class templ_pack_writeptr : public B {
251 public:
252 templ_pack_writeptr(uint64_t *base) : B(base) { }
254 This constructor with specified start position will only be available in
255 some instantiations.
257 templ_pack_writeptr(uint64_t *base, uint64_t start_bitpos) :
258 B(base, start_bitpos) { }
260 void pack1(uint64_t v);
261 void pack2(uint64_t v);
262 void pack3(uint64_t v);
263 void pack4(uint64_t v);
264 void pack5(uint64_t v);
267 These are used to efficiently pack/unpack signed numbers.
268 They convert to a representation where numbers of small magnitude (both
269 negative and positive numbers) use few significant bits (most leading bits
270 are zero).
272 static uint64_t signed_encode(int64_t i);
273 static int64_t signed_decode(uint64_t u);
277 This template adds both pack() and unpack() to a base class that supports
278 both putbits() and getbits().
280 template <class B>
281 class templ_pack_ptr : public templ_pack_writeptr<B> {
282 public:
283 templ_pack_ptr(uint64_t *base) : templ_pack_writeptr<B>(base) { }
284 templ_pack_ptr(uint64_t *base, uint64_t start_bitpos) :
285 templ_pack_writeptr<B>(base, start_bitpos) { }
286 uint64_t unpack1();
287 uint64_t unpack2();
288 uint64_t unpack3();
289 uint64_t unpack4();
290 uint64_t unpack5();
293 typedef templ_pack_ptr<pack_ptr_base> pack_ptr;
296 template <class B>
297 inline void
298 templ_pack_writeptr<B>::pack1(uint64_t v)
301 Compute the 3-bit header counter C, where we store C*9+1 bits.
302 We have C = ((#bits - 1) + 8) / 9 [rounding down].
303 And #bits = 64 - #leading zeroes, and 64 - 1 + 8 == 71.
304 We use |1 to be sure to get at least a size of one bit (__builtin_clzl is
305 undefined for argument of zero).
307 uint64_t bits= (71 - COUNT_LEADING_ZEROS_64(v | 1))/9;
308 B::putbits(3, bits);
309 B::putbits(bits*9+1, v);
312 template <class B>
313 inline uint64_t
314 templ_pack_ptr<B>::unpack1()
316 uint64_t n= B::getbits(3);
317 return B::getbits(n*9+1);
320 template <class B>
321 inline void
322 templ_pack_writeptr<B>::pack2(uint64_t v)
324 uint64_t bits= (79 - COUNT_LEADING_ZEROS_64(v | 1))/20;
325 B::putbits(2, bits);
326 B::putbits(bits*20+4, v);
329 template <class B>
330 inline uint64_t
331 templ_pack_ptr<B>::unpack2()
333 uint64_t n= B::getbits(2);
334 return B::getbits(n*20+4);
337 #define LOOKUP_PACK_GET_TAG_BITS(v) ( (v) & 0xffff )
338 #define LOOKUP_PACK_GET_TAG_VALUE(v) ( (v) >> 24 )
339 #define LOOKUP_PACK_GET_DATA_BITS(v) ( ((v) >> 16) & 0xff )
341 #define LOOKUP_UNPACK_GET_BITS(v) ( (v) & 0xffff )
342 #define LOOKUP_UNPACK_GET_OR_VALUE(v) ( (v) >> 24 )
343 #define LOOKUP_UNPACK_GET_LSHIFT(v) ( ((v) >> 16) & 0xff )
345 extern uint32_t lookup_table_pack3[];
346 template <class B>
347 inline void
348 templ_pack_writeptr<B>::pack3(uint64_t v)
350 uint bits= COUNT_LEADING_ZEROS_64(v | 1);
351 uint32_t lookup_word= lookup_table_pack3[bits];
352 uint64_t tag_bits= LOOKUP_PACK_GET_TAG_BITS(lookup_word);
353 uint64_t tag_value= LOOKUP_PACK_GET_TAG_VALUE(lookup_word);
354 uint64_t data_bits= LOOKUP_PACK_GET_DATA_BITS(lookup_word);
355 B::putbits(tag_bits, tag_value);
356 B::putbits(data_bits, v);
359 extern uint32_t lookup_table_unpack3[];
360 template <class B>
361 inline uint64_t
362 templ_pack_ptr<B>::unpack3()
364 uint prefix= B::getbits(3);
365 uint32_t lookup_word = lookup_table_unpack3[prefix];
366 uint64_t bits= LOOKUP_UNPACK_GET_BITS(lookup_word);
367 uint64_t or_value= LOOKUP_UNPACK_GET_OR_VALUE(lookup_word);
368 #ifndef WORDS_BIGENDIAN
369 uint64_t lshift= LOOKUP_UNPACK_GET_LSHIFT(lookup_word);
370 #endif
371 return ( B::getbits(bits)
372 #ifndef WORDS_BIGENDIAN
373 << lshift
374 #endif
375 ) | or_value;
378 extern uint32_t lookup_table_pack4[];
379 template <class B>
380 inline void
381 templ_pack_writeptr<B>::pack4(uint64_t v)
383 uint bits= COUNT_LEADING_ZEROS_64(v | 1);
384 uint32_t lookup_word= lookup_table_pack4[bits];
385 uint64_t tag_bits= LOOKUP_PACK_GET_TAG_BITS(lookup_word);
386 uint64_t tag_value= LOOKUP_PACK_GET_TAG_VALUE(lookup_word);
387 uint64_t data_bits= LOOKUP_PACK_GET_DATA_BITS(lookup_word);
388 B::putbits(tag_bits, tag_value);
389 B::putbits(data_bits, v);
392 extern uint32_t lookup_table_unpack4[];
393 template <class B>
394 inline uint64_t
395 templ_pack_ptr<B>::unpack4()
397 uint prefix= B::getbits(3);
398 uint32_t lookup_word = lookup_table_unpack4[prefix];
399 uint64_t bits= LOOKUP_UNPACK_GET_BITS(lookup_word);
400 uint64_t or_value= LOOKUP_UNPACK_GET_OR_VALUE(lookup_word);
401 #ifndef WORDS_BIGENDIAN
402 uint64_t lshift= LOOKUP_UNPACK_GET_LSHIFT(lookup_word);
403 #endif
404 return ( B::getbits(bits)
405 #ifndef WORDS_BIGENDIAN
406 << lshift
407 #endif
408 ) | or_value;
411 #undef LOOKUP_PACK_GET_TAG_BITS
412 #undef LOOKUP_PACK_GET_TAG_VALUE
413 #undef LOOKUP_PACK_GET_DATA_BITS
415 #undef LOOKUP_UNPACK_GET_BITS
416 #undef LOOKUP_UNPACK_GET_OR_VALUE
417 #undef LOOKUP_UNPACK_GET_LSHIFT
420 Try hard to reduce the length of the dependency chain:
421 - Don't use mul/div, as they have high latency.
422 - Very simple computation of first part of value (3 ops) to avoid
423 initial stall.
424 - Compute second part independently from first for parallelism.
426 Method: 4 bits of header, 4, 8, 12, ..., 64 bits of data, total 8-68 bits.
428 This is significantly faster (maybe 33% or so) than pack[1-4].
430 template <class B>
431 inline void
432 templ_pack_writeptr<B>::pack5(uint64_t v)
434 uint64_t highest_bit;
435 asm( "bsrq %1, %0" : "=r" (highest_bit) : "r" (v|1) : "cc");
436 uint64_t bits1= highest_bit>>2;
437 uint64_t bits2= (highest_bit & 0x3c) + 4;
438 B::putbits(4, bits1);
439 B::putbits(bits2, v);
442 template <class B>
443 inline uint64_t
444 templ_pack_ptr<B>::unpack5()
446 uint64_t n= B::getbits(4);
447 return B::getbits((n*4) + 4);
451 Encoding signed numbers as unsigned for pack/unpack, with small number of
452 significant digits for numbers of small magnitude.
454 Method is to store the sign bit in bit 0, and in bits 1-63 store x (for
455 non-negative x) respectively -(x+1) (for negative x).
457 The asymmetry allows representing _all_ numbers exactly and avoids
458 representing -0.
460 Binary encoding of 0, 1, 2, 3: 000, 010, 100, 110
461 Binary encoding of -1, -2, -3: 001, 011, 101
462 Decoded values by increasing encoding: 0, -1, 1, -2, 2, -3, 3, ...
464 The encoding of signed i as unsigned u and vise versa can be descripted thus:
466 u = i >= 0 ? (i << 1) : (1 | ((-(i+1))<<1))
467 i = (u & 1) ? (-(u>>1) - 1) : (u>>1)
469 However, the implementation uses clever bit manipulations to get the same
470 results with fewer operations and no conditionals. The dependency chain
471 length for encode is only 4, and for decode 3 operations.
473 The main idea is that for signed number i, (i>>63) is either 0 or
474 0xffffffffffffffff depending on sign; and (i^0xffffffffffffffff) = -(i+1).
476 ToDo: Hm, signed shift right is actually implementation dependent in
477 C/C++ :-(. So need some port/*.h to replace (i>>63) with -(i < 0).
479 template <class B>
480 inline uint64_t
481 templ_pack_writeptr<B>::signed_encode(int64_t i)
483 return (((uint64_t)i ^ (uint64_t)(i>>63)) << 1) | (uint64_t)i >> 63;
486 template <class B>
487 inline int64_t
488 templ_pack_writeptr<B>::signed_decode(uint64_t u)
490 return (int64_t) ( (u >> 1) ^ (uint64_t)((int64_t)(u<<63) >> 63) );
496 Class for stream writing bit-aligned values of arbitrary bit-size <= 64.
497 This class is optimized for writing new data sequentially, overwriting
498 any previous values. This way it does not need to to any loads from the
499 destination buffer at all.
501 class pack_overwriteptr_base {
502 private:
503 uint64_t *const base;
504 uint64_t v;
505 uint64_t i;
506 uint64_t bitpos;
508 public:
509 pack_overwriteptr_base(uint64_t *_base);
510 pack_overwriteptr_base(uint64_t *_base, uint64_t start_bitpos);
513 Write bits into buffer.
514 It is necessary to call flush() after the last write to ensure that any
515 final fractional 64-bit destination word is stored into buffer. This way
516 the putbits() call is faster since it does not need to execute a store
517 at every invocation.
519 void putbits(uint64_t numbits, uint64_t data);
520 /* Special version to optimize writing a full 64-bit word. */
521 void putbits64(uint64_t data);
522 void flush();
523 uint64_t used_words() { return i + (bitpos > 0); }
524 uint64_t used_bits() { return (i<<6) + bitpos; }
527 typedef templ_pack_writeptr<pack_overwriteptr_base> pack_overwriteptr;
529 inline
530 pack_overwriteptr_base::pack_overwriteptr_base(uint64_t *_base) :
531 base(_base), v(0), i(0), bitpos(0)
535 inline
536 pack_overwriteptr_base::pack_overwriteptr_base(uint64_t *_base,
537 uint64_t start_bitpos) :
538 base(_base), bitpos(start_bitpos & 0x3f)
540 i= start_bitpos >> 6;
541 if (bitpos > 0)
543 v= base[i] & (~(uint64_t)0 >> (64 - bitpos));
545 else
547 v= 0;
551 inline void
552 pack_overwriteptr_base::putbits(uint64_t numbits, uint64_t data)
554 uint64_t old_bitpos= bitpos;
555 v|= data << old_bitpos;
556 bitpos= old_bitpos + numbits;
557 if (bitpos >= 64)
559 #ifdef NONTEMPORAL_EXPERIMENT
560 STORE_NON_TEMPORAL_64(base + i, v);
561 i++;
562 #else
563 base[i++]= v;
564 #endif
565 bitpos-= 64;
566 if (likely(!(numbits == 64 && old_bitpos == 0)))
567 v= data >> (64 - old_bitpos);
568 else
569 v= 0;
573 inline void
574 pack_overwriteptr_base::putbits64(uint64_t data)
576 v|= data << bitpos;
577 #ifdef NONTEMPORAL_EXPERIMENT
578 STORE_NON_TEMPORAL_64(base + i, v);
579 i++;
580 #else
581 base[i++]= v;
582 #endif
583 if (bitpos != 0)
584 v= data >> (64 - bitpos);
585 else
586 v= 0;
589 inline void
590 pack_overwriteptr_base::flush()
592 if (bitpos > 0) {
593 #ifdef NONTEMPORAL_EXPERIMENT
594 STORE_NON_TEMPORAL_64(base + i, v);
595 #else
596 base[i]= v;
597 #endif