Remove "[Glass and newer variant]" comment notes
[xapian.git] / xapian-core / common / pack.h
blob1f911f5db0253c1b769bffe1763bede24ff491dc
1 /** @file pack.h
2 * @brief Pack types into strings and unpack them again.
3 */
4 /* Copyright (C) 2009,2015,2016,2017 Olly Betts
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
21 #ifndef XAPIAN_INCLUDED_PACK_H
22 #define XAPIAN_INCLUDED_PACK_H
24 #include <cstring>
25 #include <string>
26 #include <type_traits>
28 #include "omassert.h"
30 #include "xapian/types.h"
32 /** How many bits to store the length of a sortable uint in.
34 * Setting this to 2 limits us to 2**32 documents in the database. If set
35 * to 3, then 2**64 documents are possible, but the database format isn't
36 * compatible.
38 const unsigned int SORTABLE_UINT_LOG2_MAX_BYTES = 2;
40 /// Calculated value used below.
41 const unsigned int SORTABLE_UINT_MAX_BYTES = 1 << SORTABLE_UINT_LOG2_MAX_BYTES;
43 /// Calculated value used below.
44 const unsigned int SORTABLE_UINT_1ST_BYTE_MASK =
45 (0xffu >> SORTABLE_UINT_LOG2_MAX_BYTES);
47 /** Append an encoded bool to a string.
49 * @param s The string to append to.
50 * @param value The bool to encode.
52 inline void
53 pack_bool(std::string & s, bool value)
55 s += char('0' | static_cast<char>(value));
58 /** Decode a bool from a string.
60 * @param p Pointer to pointer to the current position in the string.
61 * @param end Pointer to the end of the string.
62 * @param result Where to store the result.
64 inline bool
65 unpack_bool(const char ** p, const char * end, bool * result)
67 Assert(result);
68 const char * & ptr = *p;
69 Assert(ptr);
70 char ch;
71 if (rare(ptr == end || ((ch = *ptr++ - '0') &~ 1))) {
72 ptr = NULL;
73 return false;
75 *result = static_cast<bool>(ch);
76 return true;
79 /** Append an encoded unsigned integer to a string as the last item.
81 * This encoding is only suitable when this is the last thing encoded as
82 * the encoding used doesn't contain its own length.
84 * @param s The string to append to.
85 * @param value The unsigned integer to encode.
87 template<class U>
88 inline void
89 pack_uint_last(std::string & s, U value)
91 static_assert(std::is_unsigned<U>::value, "Unsigned type required");
93 while (value) {
94 s += char(value & 0xff);
95 value >>= 8;
99 /** Decode an unsigned integer as the last item in a string.
101 * @param p Pointer to pointer to the current position in the string.
102 * @param end Pointer to the end of the string.
103 * @param result Where to store the result.
105 template<class U>
106 inline bool
107 unpack_uint_last(const char ** p, const char * end, U * result)
109 static_assert(std::is_unsigned<U>::value, "Unsigned type required");
110 Assert(result);
112 const char * ptr = *p;
113 Assert(ptr);
114 *p = end;
116 // Check for overflow.
117 if (rare(end - ptr > int(sizeof(U)))) {
118 return false;
121 *result = 0;
122 while (end != ptr) {
123 *result = (*result << 8) | U(static_cast<unsigned char>(*--end));
126 return true;
129 #if HAVE_DECL___BUILTIN_CLZ && \
130 HAVE_DECL___BUILTIN_CLZL && \
131 HAVE_DECL___BUILTIN_CLZLL
132 inline int do_clz(unsigned value) { return __builtin_clz(value); }
134 inline int do_clz(unsigned long value) { return __builtin_clzl(value); }
136 inline int do_clz(unsigned long long value) { return __builtin_clzll(value); }
138 # define HAVE_DO_CLZ
139 #endif
141 /** Append an encoded unsigned integer to a string, preserving the sort order.
143 * The appended string data will sort in the same order as the unsigned
144 * integer being encoded.
146 * Note that the first byte of the encoding will never be \xff, so it is
147 * safe to store the result of this function immediately after the result of
148 * pack_string_preserving_sort().
150 * @param s The string to append to.
151 * @param value The unsigned integer to encode.
153 template<class U>
154 inline void
155 pack_uint_preserving_sort(std::string & s, U value)
157 static_assert(std::is_unsigned<U>::value, "Unsigned type required");
158 static_assert(sizeof(U) <= 8,
159 "Template type U too wide for database format");
160 // The clz() functions are undefined for 0, so handle the smallest band
161 // as a special case.
162 if (value < 0x8000) {
163 s.resize(s.size() + 2);
164 s[s.size() - 2] = static_cast<unsigned char>(value >> 8);
165 Assert(s[s.size() - 2] != '\xff');
166 s[s.size() - 1] = static_cast<unsigned char>(value);
167 return;
170 #ifdef HAVE_DO_CLZ
171 size_t len = ((sizeof(U) * 8 + 5) - do_clz(value)) / 7;
172 #else
173 size_t len = 3;
174 for (U x = value >> 22; x; x >>= 7) ++len;
175 #endif
176 unsigned mask = 0xff << (10 - len);
178 s.resize(s.size() + len);
179 for (size_t i = 1; i != len; ++i) {
180 s[s.size() - i] = static_cast<unsigned char>(value);
181 value >>= 8;
184 s[s.size() - len] = static_cast<unsigned char>(value | mask);
185 Assert(s[s.size() - len] != '\xff');
187 AssertRel(len, >, 2);
188 AssertRel(len, <=, 9);
191 /** Decode an "sort preserved" unsigned integer from a string.
193 * The unsigned integer must have been encoded with
194 * pack_uint_preserving_sort().
196 * @param p Pointer to pointer to the current position in the string.
197 * @param end Pointer to the end of the string.
198 * @param result Where to store the result.
200 template<class U>
201 inline bool
202 unpack_uint_preserving_sort(const char ** p, const char * end, U * result)
204 static_assert(std::is_unsigned<U>::value, "Unsigned type required");
205 static_assert(sizeof(U) <= 8,
206 "Template type U too wide for database format");
207 Assert(result);
209 const char * ptr = *p;
210 Assert(ptr);
212 if (rare(ptr == end)) {
213 return false;
216 unsigned char len_byte = static_cast<unsigned char>(*ptr++);
217 if (len_byte < 0x80) {
218 *result = (U(len_byte) << 8) | static_cast<unsigned char>(*ptr++);
219 *p = ptr;
220 return true;
223 if (len_byte == 0xff) {
224 return false;
227 // len is how many bytes there are after the length byte.
228 #ifdef HAVE_DO_CLZ
229 size_t len = do_clz(len_byte ^ 0xffu) + 9 - sizeof(unsigned) * 8;
230 #else
231 size_t len = 2;
232 for (unsigned char m = 0x40; len_byte & m; m >>= 1) ++len;
233 #endif
234 if (rare(size_t(end - ptr) < len)) {
235 return false;
237 unsigned mask = 0xff << (9 - len);
238 len_byte &= ~mask;
240 // Check for overflow.
241 if (rare(len > int(sizeof(U)))) return false;
242 if (sizeof(U) != 8) {
243 // Need to check the top byte too.
244 if (rare(len == int(sizeof(U)) && len_byte != 0)) return false;
247 end = ptr + len;
248 *p = end;
250 U r = len_byte;
251 while (ptr != end) {
252 r = (r << 8) | U(static_cast<unsigned char>(*ptr++));
254 *result = r;
256 return true;
259 /** Append an encoded unsigned integer to a string.
261 * @param s The string to append to.
262 * @param value The unsigned integer to encode.
264 template<class U>
265 inline void
266 pack_uint(std::string & s, U value)
268 static_assert(std::is_unsigned<U>::value, "Unsigned type required");
270 while (value >= 128) {
271 s += static_cast<char>(static_cast<unsigned char>(value) | 0x80);
272 value >>= 7;
274 s += static_cast<char>(value);
277 /** Append an encoded unsigned integer (bool type) to a string.
279 * @param s The string to append to.
280 * @param value The unsigned integer to encode.
282 template<>
283 inline void
284 pack_uint(std::string & s, bool value)
286 s += static_cast<char>(value);
289 /** Decode an unsigned integer from a string.
291 * @param p Pointer to pointer to the current position in the string.
292 * @param end Pointer to the end of the string.
293 * @param result Where to store the result (or NULL to just skip it).
295 template<class U>
296 inline bool
297 unpack_uint(const char ** p, const char * end, U * result)
299 static_assert(std::is_unsigned<U>::value, "Unsigned type required");
301 const char * ptr = *p;
302 Assert(ptr);
303 const char * start = ptr;
305 // Check the length of the encoded integer first.
306 do {
307 if (rare(ptr == end)) {
308 // Out of data.
309 *p = NULL;
310 return false;
312 } while (static_cast<unsigned char>(*ptr++) >= 128);
314 *p = ptr;
316 if (!result) return true;
318 *result = U(*--ptr);
319 if (ptr == start) {
320 // Special case for small values.
321 return true;
324 size_t maxbits = size_t(ptr - start) * 7;
325 if (maxbits <= sizeof(U) * 8) {
326 // No possibility of overflow.
327 do {
328 unsigned char chunk = static_cast<unsigned char>(*--ptr) & 0x7f;
329 *result = (*result << 7) | U(chunk);
330 } while (ptr != start);
331 return true;
334 size_t minbits = maxbits - 6;
335 if (rare(minbits > sizeof(U) * 8)) {
336 // Overflow.
337 return false;
340 while (--ptr != start) {
341 unsigned char chunk = static_cast<unsigned char>(*--ptr) & 0x7f;
342 *result = (*result << 7) | U(chunk);
345 U tmp = *result;
346 *result <<= 7;
347 if (rare(*result < tmp)) {
348 // Overflow.
349 return false;
351 *result |= U(static_cast<unsigned char>(*ptr) & 0x7f);
352 return true;
355 /** Append an encoded std::string to a string.
357 * @param s The string to append to.
358 * @param value The std::string to encode.
360 inline void
361 pack_string(std::string & s, const std::string & value)
363 pack_uint(s, value.size());
364 s += value;
367 /** Append an encoded C-style string to a string.
369 * @param s The string to append to.
370 * @param ptr The C-style string to encode.
372 inline void
373 pack_string(std::string & s, const char * ptr)
375 Assert(ptr);
376 size_t len = std::strlen(ptr);
377 pack_uint(s, len);
378 s.append(ptr, len);
381 /** Decode a std::string from a string.
383 * @param p Pointer to pointer to the current position in the string.
384 * @param end Pointer to the end of the string.
385 * @param result Where to store the result.
387 inline bool
388 unpack_string(const char ** p, const char * end, std::string & result)
390 size_t len;
391 if (rare(!unpack_uint(p, end, &len))) {
392 return false;
395 const char * & ptr = *p;
396 if (rare(len > size_t(end - ptr))) {
397 ptr = NULL;
398 return false;
401 result.assign(ptr, len);
402 ptr += len;
403 return true;
406 /** Append an encoded std::string to a string, preserving the sort order.
408 * The byte which follows this encoded value *must not* be \xff, or the sort
409 * order won't be correct. You may need to store a padding byte (\0 say) to
410 * ensure this. Note that pack_uint_preserving_sort() can never produce
411 * \xff as its first byte so is safe to use immediately afterwards.
413 * @param s The string to append to.
414 * @param value The std::string to encode.
415 * @param last If true, this is the last thing to be encoded in this
416 * string - see note below (default: false)
418 * It doesn't make sense to use pack_string_preserving_sort() if nothing can
419 * ever follow, but if optional items can, you can set last=true in cases
420 * where nothing does and get a shorter encoding in those cases.
422 inline void
423 pack_string_preserving_sort(std::string & s, const std::string & value,
424 bool last = false)
426 std::string::size_type b = 0, e;
427 while ((e = value.find('\0', b)) != std::string::npos) {
428 ++e;
429 s.append(value, b, e - b);
430 s += '\xff';
431 b = e;
433 s.append(value, b, std::string::npos);
434 if (!last) s += '\0';
437 /** Decode a "sort preserved" std::string from a string.
439 * The std::string must have been encoded with pack_string_preserving_sort().
441 * @param p Pointer to pointer to the current position in the string.
442 * @param end Pointer to the end of the string.
443 * @param result Where to store the result.
445 inline bool
446 unpack_string_preserving_sort(const char ** p, const char * end,
447 std::string & result)
449 result.resize(0);
451 const char *ptr = *p;
452 Assert(ptr);
454 while (ptr != end) {
455 char ch = *ptr++;
456 if (rare(ch == '\0')) {
457 if (usual(ptr == end || *ptr != '\xff')) {
458 break;
460 ++ptr;
462 result += ch;
464 *p = ptr;
465 return true;
468 inline std::string
469 pack_glass_postlist_key(const std::string &term)
471 // Special case for doclen lists.
472 if (term.empty())
473 return std::string("\x00\xe0", 2);
475 std::string key;
476 pack_string_preserving_sort(key, term, true);
477 return key;
480 inline std::string
481 pack_glass_postlist_key(const std::string &term, Xapian::docid did)
483 // Special case for doclen lists.
484 if (term.empty()) {
485 std::string key("\x00\xe0", 2);
486 pack_uint_preserving_sort(key, did);
487 return key;
490 std::string key;
491 pack_string_preserving_sort(key, term);
492 pack_uint_preserving_sort(key, did);
493 return key;
496 #endif // XAPIAN_INCLUDED_PACK_H