[ci] Enable IRC notifications from travis
[xapian.git] / xapian-core / common / bitstream.cc
blob8ae0f8778088ab4d703b0ef85db9b00e1cc04c64
1 /** @file bitstream.cc
2 * @brief Classes to encode/decode a bitstream.
3 */
4 /* Copyright (C) 2004,2005,2006,2008,2013,2014,2016,2017,2018 Olly Betts
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License as
8 * published by the Free Software Foundation; either version 2 of the
9 * License, or (at your option) any later version.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
19 * USA
22 #include <config.h>
24 #include "bitstream.h"
26 #include <xapian/types.h>
28 #include "omassert.h"
30 #include <cmath>
31 #include <vector>
33 using namespace std;
35 #if HAVE_DECL___BUILTIN_CLZ && \
36 HAVE_DECL___BUILTIN_CLZL && \
37 HAVE_DECL___BUILTIN_CLZLL
38 inline int do_clz(unsigned value) { return __builtin_clz(value); }
40 inline int do_clz(unsigned long value) { return __builtin_clzl(value); }
42 inline int do_clz(unsigned long long value) { return __builtin_clzll(value); }
44 # define HAVE_DO_CLZ
45 #endif
47 // Highly optimised fls() implementation.
48 template<typename T>
49 inline int
50 highest_order_bit(T mask)
52 #ifdef HAVE_DO_CLZ
53 return mask ? sizeof(T) * 8 - do_clz(mask) : 0;
54 #else
55 static const unsigned char flstab[256] = {
56 0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
57 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
58 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
59 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
60 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
61 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
62 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
63 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
64 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
65 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
66 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
67 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
68 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
69 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
70 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
71 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8
74 int result = 0;
75 if (sizeof(T) > 4) {
76 if (mask >= 0x100000000ul) {
77 mask >>= 32;
78 result += 32;
81 if (mask >= 0x10000u) {
82 mask >>= 16;
83 result += 16;
85 if (mask >= 0x100u) {
86 mask >>= 8;
87 result += 8;
89 return result + flstab[mask];
90 #endif
93 namespace Xapian {
95 /// Shift left that's safe for shifts wider than the type.
96 template<typename T, typename U>
97 constexpr inline
98 T safe_shl(T x, U shift)
100 return (shift >= sizeof(T) * 8 ? 0 : x << shift);
103 void
104 BitWriter::encode(Xapian::termpos value, Xapian::termpos outof)
106 Assert(value < outof);
107 unsigned bits = highest_order_bit(outof - Xapian::termpos(1));
108 const Xapian::termpos spare = safe_shl(Xapian::termpos(1), bits) - outof;
109 if (spare) {
110 /* If we have spare values, we can use one fewer bit to encode some
111 * values. We shorten the values in the middle of the range, as
112 * testing (on positional data) shows this works best. "Managing
113 * Gigabytes" suggests reversing this for the lowest level and encoding
114 * the end values of the range shorter, which is contrary to our
115 * testing (MG is talking about posting lists, which probably have
116 * different characteristics).
118 * For example, if outof is 11, the codes emitted are:
120 * value output
121 * 0 0000
122 * 1 0001
123 * 2 0010
124 * 3 011
125 * 4 100
126 * 5 101
127 * 6 110
128 * 7 111
129 * 8 1000
130 * 9 1001
131 * 10 1010
133 * Note the LSB comes first in the bitstream, so these codes need to be
134 * suffix-free to be decoded.
136 const Xapian::termpos mid_start = (outof - spare) / 2;
137 if (value >= mid_start + spare) {
138 value = (value - (mid_start + spare)) |
139 (Xapian::termpos(1) << (bits - 1));
140 } else if (value >= mid_start) {
141 --bits;
145 if (bits + n_bits > sizeof(acc) * 8) {
146 // We need to write more bits than there's empty room for in
147 // the accumulator. So we arrange to shift out 8 bits, then
148 // adjust things so we're adding 8 fewer bits.
149 Assert(bits <= sizeof(acc) * 8);
150 acc |= (value << n_bits);
151 buf += char(acc);
152 acc >>= 8;
153 value >>= 8;
154 bits -= 8;
156 acc |= (value << n_bits);
157 n_bits += bits;
158 while (n_bits >= 8) {
159 buf += char(acc);
160 acc >>= 8;
161 n_bits -= 8;
165 void
166 BitWriter::encode_interpolative(const Xapian::VecCOW<Xapian::termpos> &pos, int j, int k)
168 // "Interpolative code" - for an algorithm description, see "Managing
169 // Gigabytes" - pages 126-127 in the second edition. You can probably
170 // view those pages in google books.
171 while (j + 1 < k) {
172 const Xapian::termpos mid = j + (k - j) / 2;
173 // Encode one out of (pos[k] - pos[j] + 1) values
174 // (less some at either end because we must be able to fit
175 // all the intervening pos in)
176 const Xapian::termpos outof = pos[k] - pos[j] + j - k + 1;
177 const Xapian::termpos lowest = pos[j] + mid - j;
178 encode(pos[mid] - lowest, outof);
179 encode_interpolative(pos, j, mid);
180 j = mid;
184 Xapian::termpos
185 BitReader::decode(Xapian::termpos outof, bool force)
187 (void)force;
188 Assert(force == di_current.is_initialized());
189 Xapian::termpos bits = highest_order_bit(outof - Xapian::termpos(1));
190 const Xapian::termpos spare = safe_shl(Xapian::termpos(1), bits) - outof;
191 const Xapian::termpos mid_start = (outof - spare) / 2;
192 Xapian::termpos pos;
193 if (spare) {
194 pos = read_bits(bits - 1);
195 if (pos < mid_start) {
196 if (read_bits(1)) pos += mid_start + spare;
198 } else {
199 pos = read_bits(bits);
201 Assert(pos < outof);
202 return pos;
205 Xapian::termpos
206 BitReader::read_bits(int count)
208 Xapian::termpos result;
209 if (count > int(sizeof(acc) * 8 - 7)) {
210 // If we need more than 7 bits less than fit in acc do the read in two
211 // goes to ensure that we don't overflow acc. This is a little more
212 // conservative than it needs to be, but such large values will
213 // inevitably be rare (because you can't fit very many of them into
214 // the full Xapian::termpos range).
215 Assert(count <= int(sizeof(acc) * 8));
216 const size_t half_the_bits = sizeof(acc) * 4;
217 result = read_bits(half_the_bits);
218 return result | (read_bits(count - half_the_bits) << half_the_bits);
220 while (n_bits < count) {
221 Assert(p < end);
222 acc |= Xapian::termpos(static_cast<unsigned char>(*p++)) << n_bits;
223 n_bits += 8;
225 result = acc & ((Xapian::termpos(1) << count) - Xapian::termpos(1));
226 acc >>= count;
227 n_bits -= count;
228 return result;
231 void
232 BitReader::decode_interpolative(int j, int k,
233 Xapian::termpos pos_j, Xapian::termpos pos_k)
235 Assert(!di_current.is_initialized());
236 di_stack.reserve(highest_order_bit(pos_k - pos_j));
237 di_current.set_j(j, pos_j);
238 di_current.set_k(k, pos_k);
241 Xapian::termpos
242 BitReader::decode_interpolative_next()
244 Assert(di_current.is_initialized());
245 while (!di_stack.empty() || di_current.is_next()) {
246 if (!di_current.is_next()) {
247 Xapian::termpos pos_ret = di_current.pos_k;
248 di_current = di_stack.back();
249 di_stack.pop_back();
250 int mid = (di_current.j + di_current.k) / 2;
251 di_current.set_j(mid, pos_ret);
252 return pos_ret;
254 di_stack.push_back(di_current);
255 int mid = (di_current.j + di_current.k) / 2;
256 Xapian::termpos pos_mid = decode(di_current.outof(), true) +
257 (di_current.pos_j + mid - di_current.j);
258 di_current.set_k(mid, pos_mid);
260 #ifdef XAPIAN_ASSERTIONS
261 di_current.uninit();
262 #endif
263 return di_current.pos_k;