2 * @brief Classes to encode/decode a bitstream.
4 /* Copyright (C) 2004,2005,2006,2008,2013,2014,2016,2017,2018 Olly Betts
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License as
8 * published by the Free Software Foundation; either version 2 of the
9 * License, or (at your option) any later version.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
24 #include "bitstream.h"
26 #include <xapian/types.h>
35 #if HAVE_DECL___BUILTIN_CLZ && \
36 HAVE_DECL___BUILTIN_CLZL && \
37 HAVE_DECL___BUILTIN_CLZLL
38 inline int do_clz(unsigned value
) { return __builtin_clz(value
); }
40 inline int do_clz(unsigned long value
) { return __builtin_clzl(value
); }
42 inline int do_clz(unsigned long long value
) { return __builtin_clzll(value
); }
47 // Highly optimised fls() implementation.
50 highest_order_bit(T mask
)
53 return mask
? sizeof(T
) * 8 - do_clz(mask
) : 0;
55 static const unsigned char flstab
[256] = {
56 0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
57 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
58 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
59 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
60 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
61 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
62 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
63 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
64 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
65 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
66 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
67 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
68 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
69 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
70 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
71 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8
76 if (mask
>= 0x100000000ul
) {
81 if (mask
>= 0x10000u
) {
89 return result
+ flstab
[mask
];
95 /// Shift left that's safe for shifts wider than the type.
96 template<typename T
, typename U
>
98 T
safe_shl(T x
, U shift
)
100 return (shift
>= sizeof(T
) * 8 ? 0 : x
<< shift
);
104 BitWriter::encode(Xapian::termpos value
, Xapian::termpos outof
)
106 Assert(value
< outof
);
107 unsigned bits
= highest_order_bit(outof
- Xapian::termpos(1));
108 const Xapian::termpos spare
= safe_shl(Xapian::termpos(1), bits
) - outof
;
110 /* If we have spare values, we can use one fewer bit to encode some
111 * values. We shorten the values in the middle of the range, as
112 * testing (on positional data) shows this works best. "Managing
113 * Gigabytes" suggests reversing this for the lowest level and encoding
114 * the end values of the range shorter, which is contrary to our
115 * testing (MG is talking about posting lists, which probably have
116 * different characteristics).
118 * For example, if outof is 11, the codes emitted are:
133 * Note the LSB comes first in the bitstream, so these codes need to be
134 * suffix-free to be decoded.
136 const Xapian::termpos mid_start
= (outof
- spare
) / 2;
137 if (value
>= mid_start
+ spare
) {
138 value
= (value
- (mid_start
+ spare
)) |
139 (Xapian::termpos(1) << (bits
- 1));
140 } else if (value
>= mid_start
) {
145 if (bits
+ n_bits
> sizeof(acc
) * 8) {
146 // We need to write more bits than there's empty room for in
147 // the accumulator. So we arrange to shift out 8 bits, then
148 // adjust things so we're adding 8 fewer bits.
149 Assert(bits
<= sizeof(acc
) * 8);
150 acc
|= (value
<< n_bits
);
156 acc
|= (value
<< n_bits
);
158 while (n_bits
>= 8) {
166 BitWriter::encode_interpolative(const Xapian::VecCOW
<Xapian::termpos
> &pos
, int j
, int k
)
168 // "Interpolative code" - for an algorithm description, see "Managing
169 // Gigabytes" - pages 126-127 in the second edition. You can probably
170 // view those pages in google books.
172 const Xapian::termpos mid
= j
+ (k
- j
) / 2;
173 // Encode one out of (pos[k] - pos[j] + 1) values
174 // (less some at either end because we must be able to fit
175 // all the intervening pos in)
176 const Xapian::termpos outof
= pos
[k
] - pos
[j
] + j
- k
+ 1;
177 const Xapian::termpos lowest
= pos
[j
] + mid
- j
;
178 encode(pos
[mid
] - lowest
, outof
);
179 encode_interpolative(pos
, j
, mid
);
185 BitReader::decode(Xapian::termpos outof
, bool force
)
188 Assert(force
== di_current
.is_initialized());
189 Xapian::termpos bits
= highest_order_bit(outof
- Xapian::termpos(1));
190 const Xapian::termpos spare
= safe_shl(Xapian::termpos(1), bits
) - outof
;
191 const Xapian::termpos mid_start
= (outof
- spare
) / 2;
194 pos
= read_bits(bits
- 1);
195 if (pos
< mid_start
) {
196 if (read_bits(1)) pos
+= mid_start
+ spare
;
199 pos
= read_bits(bits
);
206 BitReader::read_bits(int count
)
208 Xapian::termpos result
;
209 if (count
> int(sizeof(acc
) * 8 - 7)) {
210 // If we need more than 7 bits less than fit in acc do the read in two
211 // goes to ensure that we don't overflow acc. This is a little more
212 // conservative than it needs to be, but such large values will
213 // inevitably be rare (because you can't fit very many of them into
214 // the full Xapian::termpos range).
215 Assert(count
<= int(sizeof(acc
) * 8));
216 const size_t half_the_bits
= sizeof(acc
) * 4;
217 result
= read_bits(half_the_bits
);
218 return result
| (read_bits(count
- half_the_bits
) << half_the_bits
);
220 while (n_bits
< count
) {
222 acc
|= Xapian::termpos(static_cast<unsigned char>(*p
++)) << n_bits
;
225 result
= acc
& ((Xapian::termpos(1) << count
) - Xapian::termpos(1));
232 BitReader::decode_interpolative(int j
, int k
,
233 Xapian::termpos pos_j
, Xapian::termpos pos_k
)
235 Assert(!di_current
.is_initialized());
236 di_stack
.reserve(highest_order_bit(pos_k
- pos_j
));
237 di_current
.set_j(j
, pos_j
);
238 di_current
.set_k(k
, pos_k
);
242 BitReader::decode_interpolative_next()
244 Assert(di_current
.is_initialized());
245 while (!di_stack
.empty() || di_current
.is_next()) {
246 if (!di_current
.is_next()) {
247 Xapian::termpos pos_ret
= di_current
.pos_k
;
248 di_current
= di_stack
.back();
250 int mid
= (di_current
.j
+ di_current
.k
) / 2;
251 di_current
.set_j(mid
, pos_ret
);
254 di_stack
.push_back(di_current
);
255 int mid
= (di_current
.j
+ di_current
.k
) / 2;
256 Xapian::termpos pos_mid
= decode(di_current
.outof(), true) +
257 (di_current
.pos_j
+ mid
- di_current
.j
);
258 di_current
.set_k(mid
, pos_mid
);
260 #ifdef XAPIAN_ASSERTIONS
263 return di_current
.pos_k
;