2 Copyright 2008 Kristian Nielsen
4 This file is part of BeeDB.
6 Foobar is free software: you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation, either version 2 of the License, or
9 (at your option) any later version.
11 Foobar is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with Foobar. If not, see <http://www.gnu.org/licenses/>.
21 Try for a version which shifts the cached 64-bit word in-place, making
22 available the result word without having to compute a shift first,
23 hopefully reducing the length of the dependency chain.
25 Also use uint64_t throughout, seems to sometimes save the odd instruction
26 otherwise needed for sign extension.
32 This was an experiment to pre-fetch one 64-bit word ahead.
33 However, in addition to a nasty requirement to keep 8 bytes of extra readable
34 memory at the end of data, it also turns out to be about 3% slower!
36 // #define PREFETCH_EXPERIMENT
39 An experiment with using non-temporal writes in pack_overwriteptr.
41 This avoid loading the memory being written into cache.
43 //#define NONTEMPORAL_EXPERIMENT
47 /* Base pointer, never modified. */
48 uint64_t * const base
;
49 /* Current value, all but lower BITSLEFT bits are zero. */
51 #ifdef PREFETCH_EXPERIMENT
54 /* Index into BASE of next word to fetch. */
56 /* Number of bits left in current word V. */
60 pack_ptr_base(uint64_t *_base
);
62 This constructor sets up to start reading from _bit_ position relative to
63 BASE. Ie. we might pass start_bitpos=100 to skip the first 5 20-bit words.
65 pack_ptr_base(uint64_t *_base
, uint64_t start_bitpos
);
67 void seek(uint64_t bitpos
);
68 uint64_t getbits(uint64_t numbits
);
70 This method needs to load + store the value at every invocation, due to
71 possibility of seek()/getbits() intermixed with putbits().
73 For more efficient multiple putbits() in serial, use pack_writeptr or
76 void putbits(uint64_t numbits
, uint64_t data
);
77 /* More efficient versions for when numbits==64. */
79 void putbits64(uint64_t data
);
81 uint64_t used_words() { return i
; }
82 uint64_t used_bits() { return (i
<<6) - bitsleft
; }
87 pack_ptr_base::pack_ptr_base(uint64_t *_base
) :
88 base(_base
), v(0), i(0), bitsleft(0)
90 #ifdef PREFETCH_EXPERIMENT
94 We do not pre-load the first word, as we use the invariant (bitsleft < 64)
95 to avoid a special case for undefined shift-by-64-bit.
100 pack_ptr_base::pack_ptr_base(uint64_t *_base
, uint64_t start_bitpos
) :
107 pack_ptr_base::seek(uint64_t bitpos
)
110 Since we cannot have bitsleft==64, in the case where we start on a word
111 boundary we need to point to the previous word and set bitsleft=0.
113 uint64_t fraction
= bitpos
& 63;
115 if (likely(fraction
> 0))
117 bitsleft
= 64 - fraction
;
118 v
= base
[i
++] >> fraction
;
125 #ifdef PREFETCH_EXPERIMENT
131 pack_ptr_base::getbits(uint64_t numbits
)
133 if (bitsleft
>= numbits
)
135 uint64_t r
= v
& (~(uint64_t)0 >> (64 - numbits
));
137 The value of bitsleft is at most 63, so for numbits==64 we never get
138 here with an otherwise undefined shift-by-64-bit operation.
147 #ifdef PREFETCH_EXPERIMENT
153 r
= (r
| (v
<< bitsleft
)) & (~(uint64_t)0 >> (64 - numbits
));
155 Guard against undefined shift-by-64-bit.
156 In many cases the numbits==64 condition will be statically determined
157 by the compiler, so no branch will be generated, or in any case the CPU
158 should be safe to predict this conditional as true.
160 if (likely(!(numbits
== 64 && bitsleft
== 0)))
161 v
>>= numbits
- bitsleft
;
164 bitsleft
= bitsleft
+ (64 - numbits
);
170 pack_ptr_base::getbits64()
173 #ifdef PREFETCH_EXPERIMENT
180 /* Guard against undefined shift-by-64-bit. */
189 pack_ptr_base::putbits(uint64_t numbits
, uint64_t data
)
193 uint64_t val
= base
[i
-1];
194 uint64_t mask
= (~(uint64_t)0 >> (64 - numbits
)) << (64 - bitsleft
);
195 val
= (val
& ~mask
) | (data
<< (64 - bitsleft
));
196 if (bitsleft
>= numbits
)
203 uint64_t rest
= numbits
- bitsleft
;
204 bitsleft
+= 64 - numbits
;
205 mask
= ~(uint64_t)0 >> bitsleft
;
206 v
= (base
[i
] & ~mask
) | (data
>> rest
);
213 uint64_t mask
= ~(uint64_t)0 >> (64 - numbits
);
214 v
= (base
[i
] & ~mask
) | data
;
220 bitsleft
= 64 - numbits
;
225 pack_ptr_base::putbits64(uint64_t data
)
229 uint64_t val
= base
[i
-1];
230 uint64_t mask
= ~(uint64_t)0 << (64 - bitsleft
);
231 val
= (val
& ~mask
) | (data
<< (64 - bitsleft
));
232 uint64_t rest
= 64 - bitsleft
;
233 mask
= ~(uint64_t)0 >> bitsleft
;
234 v
= (base
[i
] & ~mask
) | (data
>> rest
);
247 This template adds pack() methods to a base class that supports putbits().
250 class templ_pack_writeptr
: public B
{
252 templ_pack_writeptr(uint64_t *base
) : B(base
) { }
254 This constructor with specified start position will only be available in
257 templ_pack_writeptr(uint64_t *base
, uint64_t start_bitpos
) :
258 B(base
, start_bitpos
) { }
260 void pack1(uint64_t v
);
261 void pack2(uint64_t v
);
262 void pack3(uint64_t v
);
263 void pack4(uint64_t v
);
264 void pack5(uint64_t v
);
267 These are used to efficiently pack/unpack signed numbers.
268 They convert to a representation where numbers of small magnitude (both
269 negative and positive numbers) use few significant bits (most leading bits
272 static uint64_t signed_encode(int64_t i
);
273 static int64_t signed_decode(uint64_t u
);
277 This template adds both pack() and unpack() to a base class that supports
278 both putbits() and getbits().
281 class templ_pack_ptr
: public templ_pack_writeptr
<B
> {
283 templ_pack_ptr(uint64_t *base
) : templ_pack_writeptr
<B
>(base
) { }
284 templ_pack_ptr(uint64_t *base
, uint64_t start_bitpos
) :
285 templ_pack_writeptr
<B
>(base
, start_bitpos
) { }
293 typedef templ_pack_ptr
<pack_ptr_base
> pack_ptr
;
298 templ_pack_writeptr
<B
>::pack1(uint64_t v
)
301 Compute the 3-bit header counter C, where we store C*9+1 bits.
302 We have C = ((#bits - 1) + 8) / 9 [rounding down].
303 And #bits = 64 - #leading zeroes, and 64 - 1 + 8 == 71.
304 We use |1 to be sure to get at least a size of one bit (__builtin_clzl is
305 undefined for argument of zero).
307 uint64_t bits
= (71 - COUNT_LEADING_ZEROS_64(v
| 1))/9;
309 B::putbits(bits
*9+1, v
);
314 templ_pack_ptr
<B
>::unpack1()
316 uint64_t n
= B::getbits(3);
317 return B::getbits(n
*9+1);
322 templ_pack_writeptr
<B
>::pack2(uint64_t v
)
324 uint64_t bits
= (79 - COUNT_LEADING_ZEROS_64(v
| 1))/20;
326 B::putbits(bits
*20+4, v
);
331 templ_pack_ptr
<B
>::unpack2()
333 uint64_t n
= B::getbits(2);
334 return B::getbits(n
*20+4);
337 #define LOOKUP_PACK_GET_TAG_BITS(v) ( (v) & 0xffff )
338 #define LOOKUP_PACK_GET_TAG_VALUE(v) ( (v) >> 24 )
339 #define LOOKUP_PACK_GET_DATA_BITS(v) ( ((v) >> 16) & 0xff )
341 #define LOOKUP_UNPACK_GET_BITS(v) ( (v) & 0xffff )
342 #define LOOKUP_UNPACK_GET_OR_VALUE(v) ( (v) >> 24 )
343 #define LOOKUP_UNPACK_GET_LSHIFT(v) ( ((v) >> 16) & 0xff )
345 extern uint32_t lookup_table_pack3
[];
348 templ_pack_writeptr
<B
>::pack3(uint64_t v
)
350 uint bits
= COUNT_LEADING_ZEROS_64(v
| 1);
351 uint32_t lookup_word
= lookup_table_pack3
[bits
];
352 uint64_t tag_bits
= LOOKUP_PACK_GET_TAG_BITS(lookup_word
);
353 uint64_t tag_value
= LOOKUP_PACK_GET_TAG_VALUE(lookup_word
);
354 uint64_t data_bits
= LOOKUP_PACK_GET_DATA_BITS(lookup_word
);
355 B::putbits(tag_bits
, tag_value
);
356 B::putbits(data_bits
, v
);
359 extern uint32_t lookup_table_unpack3
[];
362 templ_pack_ptr
<B
>::unpack3()
364 uint prefix
= B::getbits(3);
365 uint32_t lookup_word
= lookup_table_unpack3
[prefix
];
366 uint64_t bits
= LOOKUP_UNPACK_GET_BITS(lookup_word
);
367 uint64_t or_value
= LOOKUP_UNPACK_GET_OR_VALUE(lookup_word
);
368 #ifndef WORDS_BIGENDIAN
369 uint64_t lshift
= LOOKUP_UNPACK_GET_LSHIFT(lookup_word
);
371 return ( B::getbits(bits
)
372 #ifndef WORDS_BIGENDIAN
378 extern uint32_t lookup_table_pack4
[];
381 templ_pack_writeptr
<B
>::pack4(uint64_t v
)
383 uint bits
= COUNT_LEADING_ZEROS_64(v
| 1);
384 uint32_t lookup_word
= lookup_table_pack4
[bits
];
385 uint64_t tag_bits
= LOOKUP_PACK_GET_TAG_BITS(lookup_word
);
386 uint64_t tag_value
= LOOKUP_PACK_GET_TAG_VALUE(lookup_word
);
387 uint64_t data_bits
= LOOKUP_PACK_GET_DATA_BITS(lookup_word
);
388 B::putbits(tag_bits
, tag_value
);
389 B::putbits(data_bits
, v
);
392 extern uint32_t lookup_table_unpack4
[];
395 templ_pack_ptr
<B
>::unpack4()
397 uint prefix
= B::getbits(3);
398 uint32_t lookup_word
= lookup_table_unpack4
[prefix
];
399 uint64_t bits
= LOOKUP_UNPACK_GET_BITS(lookup_word
);
400 uint64_t or_value
= LOOKUP_UNPACK_GET_OR_VALUE(lookup_word
);
401 #ifndef WORDS_BIGENDIAN
402 uint64_t lshift
= LOOKUP_UNPACK_GET_LSHIFT(lookup_word
);
404 return ( B::getbits(bits
)
405 #ifndef WORDS_BIGENDIAN
411 #undef LOOKUP_PACK_GET_TAG_BITS
412 #undef LOOKUP_PACK_GET_TAG_VALUE
413 #undef LOOKUP_PACK_GET_DATA_BITS
415 #undef LOOKUP_UNPACK_GET_BITS
416 #undef LOOKUP_UNPACK_GET_OR_VALUE
417 #undef LOOKUP_UNPACK_GET_LSHIFT
420 Try hard to reduce the length of the dependency chain:
421 - Don't use mul/div, as they have high latency.
422 - Very simple computation of first part of value (3 ops) to avoid
424 - Compute second part independently from first for parallelism.
426 Method: 4 bits of header, 4, 8, 12, ..., 64 bits of data, total 8-68 bits.
428 This is significantly faster (maybe 33% or so) than pack[1-4].
432 templ_pack_writeptr
<B
>::pack5(uint64_t v
)
434 uint64_t highest_bit
;
435 asm( "bsrq %1, %0" : "=r" (highest_bit
) : "r" (v
|1) : "cc");
436 uint64_t bits1
= highest_bit
>>2;
437 uint64_t bits2
= (highest_bit
& 0x3c) + 4;
438 B::putbits(4, bits1
);
439 B::putbits(bits2
, v
);
444 templ_pack_ptr
<B
>::unpack5()
446 uint64_t n
= B::getbits(4);
447 return B::getbits((n
*4) + 4);
451 Encoding signed numbers as unsigned for pack/unpack, with small number of
452 significant digits for numbers of small magnitude.
454 Method is to store the sign bit in bit 0, and in bits 1-63 store x (for
455 non-negative x) respectively -(x+1) (for negative x).
457 The asymmetry allows representing _all_ numbers exactly and avoids
460 Binary encoding of 0, 1, 2, 3: 000, 010, 100, 110
461 Binary encoding of -1, -2, -3: 001, 011, 101
462 Decoded values by increasing encoding: 0, -1, 1, -2, 2, -3, 3, ...
464 The encoding of signed i as unsigned u and vise versa can be descripted thus:
466 u = i >= 0 ? (i << 1) : (1 | ((-(i+1))<<1))
467 i = (u & 1) ? (-(u>>1) - 1) : (u>>1)
469 However, the implementation uses clever bit manipulations to get the same
470 results with fewer operations and no conditionals. The dependency chain
471 length for encode is only 4, and for decode 3 operations.
473 The main idea is that for signed number i, (i>>63) is either 0 or
474 0xffffffffffffffff depending on sign; and (i^0xffffffffffffffff) = -(i+1).
476 ToDo: Hm, signed shift right is actually implementation dependent in
477 C/C++ :-(. So need some port/*.h to replace (i>>63) with -(i < 0).
481 templ_pack_writeptr
<B
>::signed_encode(int64_t i
)
483 return (((uint64_t)i
^ (uint64_t)(i
>>63)) << 1) | (uint64_t)i
>> 63;
488 templ_pack_writeptr
<B
>::signed_decode(uint64_t u
)
490 return (int64_t) ( (u
>> 1) ^ (uint64_t)((int64_t)(u
<<63) >> 63) );
496 Class for stream writing bit-aligned values of arbitrary bit-size <= 64.
497 This class is optimized for writing new data sequentially, overwriting
498 any previous values. This way it does not need to to any loads from the
499 destination buffer at all.
501 class pack_overwriteptr_base
{
503 uint64_t *const base
;
509 pack_overwriteptr_base(uint64_t *_base
);
510 pack_overwriteptr_base(uint64_t *_base
, uint64_t start_bitpos
);
513 Write bits into buffer.
514 It is necessary to call flush() after the last write to ensure that any
515 final fractional 64-bit destination word is stored into buffer. This way
516 the putbits() call is faster since it does not need to execute a store
519 void putbits(uint64_t numbits
, uint64_t data
);
520 /* Special version to optimize writing a full 64-bit word. */
521 void putbits64(uint64_t data
);
523 uint64_t used_words() { return i
+ (bitpos
> 0); }
524 uint64_t used_bits() { return (i
<<6) + bitpos
; }
527 typedef templ_pack_writeptr
<pack_overwriteptr_base
> pack_overwriteptr
;
530 pack_overwriteptr_base::pack_overwriteptr_base(uint64_t *_base
) :
531 base(_base
), v(0), i(0), bitpos(0)
536 pack_overwriteptr_base::pack_overwriteptr_base(uint64_t *_base
,
537 uint64_t start_bitpos
) :
538 base(_base
), bitpos(start_bitpos
& 0x3f)
540 i
= start_bitpos
>> 6;
543 v
= base
[i
] & (~(uint64_t)0 >> (64 - bitpos
));
552 pack_overwriteptr_base::putbits(uint64_t numbits
, uint64_t data
)
554 uint64_t old_bitpos
= bitpos
;
555 v
|= data
<< old_bitpos
;
556 bitpos
= old_bitpos
+ numbits
;
559 #ifdef NONTEMPORAL_EXPERIMENT
560 STORE_NON_TEMPORAL_64(base
+ i
, v
);
566 if (likely(!(numbits
== 64 && old_bitpos
== 0)))
567 v
= data
>> (64 - old_bitpos
);
574 pack_overwriteptr_base::putbits64(uint64_t data
)
577 #ifdef NONTEMPORAL_EXPERIMENT
578 STORE_NON_TEMPORAL_64(base
+ i
, v
);
584 v
= data
>> (64 - bitpos
);
590 pack_overwriteptr_base::flush()
593 #ifdef NONTEMPORAL_EXPERIMENT
594 STORE_NON_TEMPORAL_64(base
+ i
, v
);