[ci] Enable IRC notifications from travis
[xapian.git] / xapian-core / common / pack.h
blobc5c9667b2d2cc082692924fc7dd7bc50adba3db2
1 /** @file pack.h
2 * @brief Pack types into strings and unpack them again.
3 */
4 /* Copyright (C) 2009,2015,2016,2017,2018 Olly Betts
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
21 #ifndef XAPIAN_INCLUDED_PACK_H
22 #define XAPIAN_INCLUDED_PACK_H
24 #include <cstring>
25 #include <string>
26 #include <type_traits>
28 #include "omassert.h"
30 #include "xapian/types.h"
32 /** How many bits to store the length of a sortable uint in.
34 * Setting this to 2 limits us to 2**32 documents in the database. If set
35 * to 3, then 2**64 documents are possible, but the database format isn't
36 * compatible.
38 const unsigned int SORTABLE_UINT_LOG2_MAX_BYTES = 2;
40 /// Calculated value used below.
41 const unsigned int SORTABLE_UINT_MAX_BYTES = 1 << SORTABLE_UINT_LOG2_MAX_BYTES;
43 /// Calculated value used below.
44 const unsigned int SORTABLE_UINT_1ST_BYTE_MASK =
45 (0xffu >> SORTABLE_UINT_LOG2_MAX_BYTES);
47 /** Append an encoded bool to a string.
49 * @param s The string to append to.
50 * @param value The bool to encode.
52 inline void
53 pack_bool(std::string & s, bool value)
55 s += char('0' | static_cast<char>(value));
58 /** Decode a bool from a string.
60 * @param p Pointer to pointer to the current position in the string.
61 * @param end Pointer to the end of the string.
62 * @param result Where to store the result.
64 inline bool
65 unpack_bool(const char ** p, const char * end, bool * result)
67 Assert(result);
68 const char * & ptr = *p;
69 Assert(ptr);
70 char ch;
71 if (rare(ptr == end || ((ch = *ptr++ - '0') &~ 1))) {
72 ptr = NULL;
73 return false;
75 *result = static_cast<bool>(ch);
76 return true;
79 /** Append an encoded unsigned integer to a string as the last item.
81 * This encoding is only suitable when this is the last thing encoded as
82 * the encoding used doesn't contain its own length.
84 * @param s The string to append to.
85 * @param value The unsigned integer to encode.
87 template<class U>
88 inline void
89 pack_uint_last(std::string & s, U value)
91 static_assert(std::is_unsigned<U>::value, "Unsigned type required");
93 while (value) {
94 s += char(value & 0xff);
95 value >>= 8;
99 /** Decode an unsigned integer as the last item in a string.
101 * @param p Pointer to pointer to the current position in the string.
102 * @param end Pointer to the end of the string.
103 * @param result Where to store the result.
105 template<class U>
106 inline bool
107 unpack_uint_last(const char ** p, const char * end, U * result)
109 static_assert(std::is_unsigned<U>::value, "Unsigned type required");
110 Assert(result);
112 const char * ptr = *p;
113 Assert(ptr);
114 *p = end;
116 // Check for overflow.
117 if (rare(end - ptr > int(sizeof(U)))) {
118 return false;
121 *result = 0;
122 while (end != ptr) {
123 *result = (*result << 8) | U(static_cast<unsigned char>(*--end));
126 return true;
129 #if HAVE_DECL___BUILTIN_CLZ && \
130 HAVE_DECL___BUILTIN_CLZL && \
131 HAVE_DECL___BUILTIN_CLZLL
132 inline int do_clz(unsigned value) { return __builtin_clz(value); }
134 inline int do_clz(unsigned long value) { return __builtin_clzl(value); }
136 inline int do_clz(unsigned long long value) { return __builtin_clzll(value); }
138 # define HAVE_DO_CLZ
139 #endif
141 /** Append an encoded unsigned integer to a string, preserving the sort order.
143 * The appended string data will sort in the same order as the unsigned
144 * integer being encoded.
146 * Note that the first byte of the encoding will never be \xff, so it is
147 * safe to store the result of this function immediately after the result of
148 * pack_string_preserving_sort().
150 * @param s The string to append to.
151 * @param value The unsigned integer to encode.
153 template<class U>
154 inline void
155 pack_uint_preserving_sort(std::string & s, U value)
157 static_assert(std::is_unsigned<U>::value, "Unsigned type required");
158 static_assert(sizeof(U) <= 8,
159 "Template type U too wide for database format");
160 // The clz() functions are undefined for 0, so handle the smallest band
161 // as a special case.
162 if (value < 0x8000) {
163 s.resize(s.size() + 2);
164 s[s.size() - 2] = static_cast<unsigned char>(value >> 8);
165 Assert(s[s.size() - 2] != '\xff');
166 s[s.size() - 1] = static_cast<unsigned char>(value);
167 return;
170 #ifdef HAVE_DO_CLZ
171 size_t len = ((sizeof(U) * 8 + 5) - do_clz(value)) / 7;
172 #else
173 size_t len = 3;
174 for (U x = value >> 22; x; x >>= 7) ++len;
175 #endif
176 unsigned mask = 0xff << (10 - len);
178 s.resize(s.size() + len);
179 for (size_t i = 1; i != len; ++i) {
180 s[s.size() - i] = static_cast<unsigned char>(value);
181 value >>= 8;
184 s[s.size() - len] = static_cast<unsigned char>(value | mask);
185 Assert(s[s.size() - len] != '\xff');
187 AssertRel(len, >, 2);
188 AssertRel(len, <=, 9);
191 /** Decode an "sort preserved" unsigned integer from a string.
193 * The unsigned integer must have been encoded with
194 * pack_uint_preserving_sort().
196 * @param p Pointer to pointer to the current position in the string.
197 * @param end Pointer to the end of the string.
198 * @param result Where to store the result.
200 template<class U>
201 inline bool
202 unpack_uint_preserving_sort(const char ** p, const char * end, U * result)
204 static_assert(std::is_unsigned<U>::value, "Unsigned type required");
205 static_assert(sizeof(U) <= 8,
206 "Template type U too wide for database format");
207 Assert(result);
209 const char * ptr = *p;
210 Assert(ptr);
212 if (rare(ptr == end)) {
213 return false;
216 unsigned char len_byte = static_cast<unsigned char>(*ptr++);
217 if (len_byte < 0x80) {
218 *result = (U(len_byte) << 8) | static_cast<unsigned char>(*ptr++);
219 *p = ptr;
220 return true;
223 if (len_byte == 0xff) {
224 return false;
227 // len is how many bytes there are after the length byte.
228 #ifdef HAVE_DO_CLZ
229 size_t len = do_clz(len_byte ^ 0xffu) + 9 - sizeof(unsigned) * 8;
230 #else
231 size_t len = 2;
232 for (unsigned char m = 0x40; len_byte & m; m >>= 1) ++len;
233 #endif
234 if (rare(size_t(end - ptr) < len)) {
235 return false;
237 unsigned mask = 0xff << (9 - len);
238 len_byte &= ~mask;
240 // Check for overflow.
241 if (rare(len > int(sizeof(U)))) return false;
242 if (sizeof(U) != 8) {
243 // Need to check the top byte too.
244 if (rare(len == int(sizeof(U)) && len_byte != 0)) return false;
247 end = ptr + len;
248 *p = end;
250 U r = len_byte;
251 while (ptr != end) {
252 r = (r << 8) | U(static_cast<unsigned char>(*ptr++));
254 *result = r;
256 return true;
259 /** Append an encoded unsigned integer to a string.
261 * @param s The string to append to.
262 * @param value The unsigned integer to encode.
264 template<class U>
265 inline void
266 pack_uint(std::string & s, U value)
268 static_assert(std::is_unsigned<U>::value, "Unsigned type required");
270 while (value >= 128) {
271 s += static_cast<char>(static_cast<unsigned char>(value) | 0x80);
272 value >>= 7;
274 s += static_cast<char>(value);
277 /** Append an encoded unsigned integer (bool type) to a string.
279 * @param s The string to append to.
280 * @param value The unsigned integer to encode.
282 template<>
283 inline void
284 pack_uint(std::string & s, bool value)
286 s += static_cast<char>(value);
289 /** Decode an unsigned integer from a string.
291 * @param p Pointer to pointer to the current position in the string.
292 * @param end Pointer to the end of the string.
293 * @param result Where to store the result (or NULL to just skip it).
295 template<class U>
296 inline bool
297 unpack_uint(const char ** p, const char * end, U * result)
299 static_assert(std::is_unsigned<U>::value, "Unsigned type required");
301 const char * ptr = *p;
302 Assert(ptr);
303 const char * start = ptr;
305 // Check the length of the encoded integer first.
306 do {
307 if (rare(ptr == end)) {
308 // Out of data.
309 *p = NULL;
310 return false;
312 } while (static_cast<unsigned char>(*ptr++) >= 128);
314 *p = ptr;
316 if (!result) return true;
318 *result = U(*--ptr);
319 if (ptr == start) {
320 // Special case for small values.
321 return true;
324 size_t maxbits = size_t(ptr - start) * 7;
325 if (maxbits <= sizeof(U) * 8) {
326 // No possibility of overflow.
327 do {
328 unsigned char chunk = static_cast<unsigned char>(*--ptr) & 0x7f;
329 *result = (*result << 7) | U(chunk);
330 } while (ptr != start);
331 return true;
334 size_t minbits = maxbits - 6;
335 if (rare(minbits > sizeof(U) * 8)) {
336 // Overflow.
337 return false;
340 while (--ptr != start) {
341 unsigned char chunk = static_cast<unsigned char>(*--ptr) & 0x7f;
342 *result = (*result << 7) | U(chunk);
345 U tmp = *result;
346 *result <<= 7;
347 if (rare(*result < tmp)) {
348 // Overflow.
349 return false;
351 *result |= U(static_cast<unsigned char>(*ptr) & 0x7f);
352 return true;
355 /** Decode an unsigned integer from a string, going backwards.
357 * @param p Pointer to pointer just after the position in the string.
358 * @param start Pointer to the start of the string.
359 * @param result Where to store the result (or NULL to just skip it).
361 template<class U>
362 inline bool
363 unpack_uint_backwards(const char ** p, const char * start, U * result)
365 static_assert(std::is_unsigned<U>::value, "Unsigned type required");
367 const char * ptr = *p;
368 Assert(ptr);
370 // Check it's not empty and that the final byte is valid.
371 if (rare(ptr == start || static_cast<unsigned char>(ptr[-1]) >= 128)) {
372 // Out of data.
373 *p = NULL;
374 return false;
377 do {
378 if (rare(--ptr == start))
379 break;
380 } while (static_cast<unsigned char>(ptr[-1]) >= 128);
382 const char* end = *p;
383 *p = ptr;
384 return unpack_uint(&ptr, end, result);
387 /** Append an encoded std::string to a string.
389 * @param s The string to append to.
390 * @param value The std::string to encode.
392 inline void
393 pack_string(std::string & s, const std::string & value)
395 pack_uint(s, value.size());
396 s += value;
399 /** Append an encoded C-style string to a string.
401 * @param s The string to append to.
402 * @param ptr The C-style string to encode.
404 inline void
405 pack_string(std::string & s, const char * ptr)
407 Assert(ptr);
408 size_t len = std::strlen(ptr);
409 pack_uint(s, len);
410 s.append(ptr, len);
413 /** Decode a std::string from a string.
415 * @param p Pointer to pointer to the current position in the string.
416 * @param end Pointer to the end of the string.
417 * @param result Where to store the result.
419 inline bool
420 unpack_string(const char ** p, const char * end, std::string & result)
422 size_t len;
423 if (rare(!unpack_uint(p, end, &len))) {
424 return false;
427 const char * & ptr = *p;
428 if (rare(len > size_t(end - ptr))) {
429 ptr = NULL;
430 return false;
433 result.assign(ptr, len);
434 ptr += len;
435 return true;
438 /** Append an encoded std::string to a string, preserving the sort order.
440 * The byte which follows this encoded value *must not* be \xff, or the sort
441 * order won't be correct. You may need to store a padding byte (\0 say) to
442 * ensure this. Note that pack_uint_preserving_sort() can never produce
443 * \xff as its first byte so is safe to use immediately afterwards.
445 * @param s The string to append to.
446 * @param value The std::string to encode.
447 * @param last If true, this is the last thing to be encoded in this
448 * string - see note below (default: false)
450 * It doesn't make sense to use pack_string_preserving_sort() if nothing can
451 * ever follow, but if optional items can, you can set last=true in cases
452 * where nothing does and get a shorter encoding in those cases.
454 inline void
455 pack_string_preserving_sort(std::string & s, const std::string & value,
456 bool last = false)
458 std::string::size_type b = 0, e;
459 while ((e = value.find('\0', b)) != std::string::npos) {
460 ++e;
461 s.append(value, b, e - b);
462 s += '\xff';
463 b = e;
465 s.append(value, b, std::string::npos);
466 if (!last) s += '\0';
469 /** Decode a "sort preserved" std::string from a string.
471 * The std::string must have been encoded with pack_string_preserving_sort().
473 * @param p Pointer to pointer to the current position in the string.
474 * @param end Pointer to the end of the string.
475 * @param result Where to store the result.
477 inline bool
478 unpack_string_preserving_sort(const char ** p, const char * end,
479 std::string & result)
481 result.resize(0);
483 const char *ptr = *p;
484 Assert(ptr);
486 while (ptr != end) {
487 char ch = *ptr++;
488 if (rare(ch == '\0')) {
489 if (usual(ptr == end || *ptr != '\xff')) {
490 break;
492 ++ptr;
494 result += ch;
496 *p = ptr;
497 return true;
500 inline std::string
501 pack_glass_postlist_key(const std::string &term)
503 // Special case for doclen lists.
504 if (term.empty())
505 return std::string("\x00\xe0", 2);
507 std::string key;
508 pack_string_preserving_sort(key, term, true);
509 return key;
512 inline std::string
513 pack_glass_postlist_key(const std::string &term, Xapian::docid did)
515 // Special case for doclen lists.
516 if (term.empty()) {
517 std::string key("\x00\xe0", 2);
518 pack_uint_preserving_sort(key, did);
519 return key;
522 std::string key;
523 pack_string_preserving_sort(key, term);
524 pack_uint_preserving_sort(key, did);
525 return key;
528 inline std::string
529 pack_honey_postlist_key(const std::string& term)
531 Assert(!term.empty());
532 std::string key;
533 pack_string_preserving_sort(key, term, true);
534 return key;
537 inline std::string
538 pack_honey_postlist_key(const std::string& term, Xapian::docid did)
540 Assert(!term.empty());
541 std::string key;
542 pack_string_preserving_sort(key, term);
543 pack_uint_preserving_sort(key, did);
544 return key;
547 #endif // XAPIAN_INCLUDED_PACK_H