Support 64-bit docids in the glass backend.
[xapian.git] / xapian-core / common / pack.h
blob7792321180d82c0d08cd4c0dec6213897399c789
1 /** @file pack.h
2 * @brief Pack types into strings and unpack them again.
3 */
4 /* Copyright (C) 2009,2015 Olly Betts
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
21 #ifndef XAPIAN_INCLUDED_PACK_H
22 #define XAPIAN_INCLUDED_PACK_H
24 #include <cstring>
25 #include <string>
27 #include "omassert.h"
29 #include "xapian/types.h"
31 /** How many bits to store the length of a sortable uint in.
33 * Setting this to 2 limits us to 2**32 documents in the database. If set
34 * to 3, then 2**64 documents are possible, but the database format isn't
35 * compatible.
37 const unsigned int SORTABLE_UINT_LOG2_MAX_BYTES = 2;
39 /// Calculated value used below.
40 const unsigned int SORTABLE_UINT_MAX_BYTES = 1 << SORTABLE_UINT_LOG2_MAX_BYTES;
42 /// Calculated value used below.
43 const unsigned int SORTABLE_UINT_1ST_BYTE_MASK =
44 (0xffu >> SORTABLE_UINT_LOG2_MAX_BYTES);
46 /** Append an encoded bool to a string.
48 * @param s The string to append to.
49 * @param value The bool to encode.
51 inline void
52 pack_bool(std::string & s, bool value)
54 s += char('0' | static_cast<char>(value));
57 /** Decode a bool from a string.
59 * @param p Pointer to pointer to the current position in the string.
60 * @param end Pointer to the end of the string.
61 * @param result Where to store the result.
63 inline bool
64 unpack_bool(const char ** p, const char * end, bool * result)
66 Assert(result);
67 const char * & ptr = *p;
68 Assert(ptr);
69 char ch;
70 if (rare(ptr == end || ((ch = *ptr++ - '0') &~ 1))) {
71 ptr = NULL;
72 return false;
74 *result = static_cast<bool>(ch);
75 return true;
78 /** Append an encoded unsigned integer to a string as the last item.
80 * This encoding is only suitable when this is the last thing encoded as
81 * the encoding used doesn't contain its own length.
83 * @param s The string to append to.
84 * @param value The unsigned integer to encode.
86 template<class U>
87 inline void
88 pack_uint_last(std::string & s, U value)
90 // Check U is an unsigned type.
91 STATIC_ASSERT_UNSIGNED_TYPE(U);
93 while (value) {
94 s += char(value & 0xff);
95 value >>= 8;
99 /** Decode an unsigned integer as the last item in a string.
101 * @param p Pointer to pointer to the current position in the string.
102 * @param end Pointer to the end of the string.
103 * @param result Where to store the result.
105 template<class U>
106 inline bool
107 unpack_uint_last(const char ** p, const char * end, U * result)
109 // Check U is an unsigned type.
110 STATIC_ASSERT_UNSIGNED_TYPE(U);
111 Assert(result);
113 const char * ptr = *p;
114 Assert(ptr);
115 *p = end;
117 // Check for overflow.
118 if (rare(end - ptr > int(sizeof(U)))) {
119 return false;
122 *result = 0;
123 while (end != ptr) {
124 *result = (*result << 8) | U(static_cast<unsigned char>(*--end));
127 return true;
130 /** Append an encoded unsigned integer to a string, preserving the sort order.
132 * [Chert variant]
134 * The appended string data will sort in the same order as the unsigned
135 * integer being encoded.
137 * Note that the first byte of the encoding will never be \xff, so it is
138 * safe to store the result of this function immediately after the result of
139 * pack_string_preserving_sort().
141 * @param s The string to append to.
142 * @param value The unsigned integer to encode.
144 template<class U>
145 inline void
146 C_pack_uint_preserving_sort(std::string & s, U value)
148 // Check U is an unsigned type.
149 STATIC_ASSERT_UNSIGNED_TYPE(U);
150 #if 0
151 // FIXME: Doesn't work with 64-bit Xapian::docid, etc.
152 static_assert(sizeof(U) <= SORTABLE_UINT_MAX_BYTES,
153 "Template type U too wide for database format");
154 #endif
156 char tmp[sizeof(U) + 1];
157 char * p = tmp + sizeof(tmp);
159 do {
160 *--p = char(value & 0xff);
161 value >>= 8;
162 } while (value &~ SORTABLE_UINT_1ST_BYTE_MASK);
164 unsigned char len = static_cast<unsigned char>(tmp + sizeof(tmp) - p);
165 *--p = char((len - 1) << (8 - SORTABLE_UINT_LOG2_MAX_BYTES) | value);
166 s.append(p, len + 1);
167 Assert(s[0] != '\xff');
170 /** Decode an "sort preserved" unsigned integer from a string.
172 * [Chert variant]
174 * The unsigned integer must have been encoded with
175 * C_pack_uint_preserving_sort().
177 * @param p Pointer to pointer to the current position in the string.
178 * @param end Pointer to the end of the string.
179 * @param result Where to store the result.
181 template<class U>
182 inline bool
183 C_unpack_uint_preserving_sort(const char ** p, const char * end, U * result)
185 // Check U is an unsigned type.
186 STATIC_ASSERT_UNSIGNED_TYPE(U);
187 static_assert(sizeof(U) < 256,
188 "Template type U too wide for database format");
189 Assert(result);
191 const char * ptr = *p;
192 Assert(ptr);
194 if (rare(ptr == end)) {
195 return false;
198 unsigned char len_byte = static_cast<unsigned char>(*ptr++);
199 *result = len_byte & SORTABLE_UINT_1ST_BYTE_MASK;
200 size_t len = (len_byte >> (8 - SORTABLE_UINT_LOG2_MAX_BYTES)) + 1;
202 if (rare(size_t(end - ptr) < len)) {
203 return false;
206 end = ptr + len;
207 *p = end;
209 // Check for overflow.
210 if (rare(len > int(sizeof(U)))) {
211 return false;
214 while (ptr != end) {
215 *result = (*result << 8) | U(static_cast<unsigned char>(*ptr++));
218 return true;
221 #ifdef __GNUC__
222 // GCC 3.4 added __builtin_clz() (with l and ll variants).
223 inline int do_clz(unsigned value) { return __builtin_clz(value); }
225 inline int do_clz(unsigned long value) { return __builtin_clzl(value); }
227 inline int do_clz(unsigned long long value) { return __builtin_clzll(value); }
229 # define HAVE_DO_CLZ
230 #endif
232 /** Append an encoded unsigned integer to a string, preserving the sort order.
234 * [Glass and newer variant]
236 * The appended string data will sort in the same order as the unsigned
237 * integer being encoded.
239 * Note that the first byte of the encoding will never be \xff, so it is
240 * safe to store the result of this function immediately after the result of
241 * pack_string_preserving_sort().
243 * @param s The string to append to.
244 * @param value The unsigned integer to encode.
246 template<class U>
247 inline void
248 pack_uint_preserving_sort(std::string & s, U value)
250 // Check U is an unsigned type.
251 STATIC_ASSERT_UNSIGNED_TYPE(U);
252 static_assert(sizeof(U) <= 8,
253 "Template type U too wide for database format");
254 // The clz() functions are undefined for 0, so handle the smallest band
255 // as a special case.
256 if (value < 0x8000) {
257 s.resize(s.size() + 2);
258 s[s.size() - 2] = static_cast<unsigned char>(value >> 8);
259 Assert(s[s.size() - 2] != '\xff');
260 s[s.size() - 1] = static_cast<unsigned char>(value);
261 return;
264 #ifdef HAVE_DO_CLZ
265 size_t len = ((sizeof(U) * 8 + 5) - do_clz(value)) / 7;
266 #else
267 size_t len = 3;
268 for (U x = value >> 22; x; x >>= 7) ++len;
269 #endif
270 unsigned mask = 0xff << (10 - len);
272 s.resize(s.size() + len);
273 for (size_t i = 1; i != len; ++i) {
274 s[s.size() - i] = static_cast<unsigned char>(value);
275 value >>= 8;
278 s[s.size() - len] = static_cast<unsigned char>(value | mask);
279 Assert(s[s.size() - len] != '\xff');
281 AssertRel(len, >, 2);
282 AssertRel(len, <=, 9);
285 /** Decode an "sort preserved" unsigned integer from a string.
287 * [Glass and newer variant]
289 * The unsigned integer must have been encoded with
290 * pack_uint_preserving_sort().
292 * @param p Pointer to pointer to the current position in the string.
293 * @param end Pointer to the end of the string.
294 * @param result Where to store the result.
296 template<class U>
297 inline bool
298 unpack_uint_preserving_sort(const char ** p, const char * end, U * result)
300 // Check U is an unsigned type.
301 STATIC_ASSERT_UNSIGNED_TYPE(U);
302 static_assert(sizeof(U) <= 8,
303 "Template type U too wide for database format");
304 Assert(result);
306 const char * ptr = *p;
307 Assert(ptr);
309 if (rare(ptr == end)) {
310 return false;
313 unsigned char len_byte = static_cast<unsigned char>(*ptr++);
314 if (len_byte < 0x80) {
315 *result = (U(len_byte) << 8) | static_cast<unsigned char>(*ptr++);
316 *p = ptr;
317 return true;
320 if (len_byte == 0xff) {
321 return false;
324 // len is how many bytes there are after the length byte.
325 #ifdef HAVE_DO_CLZ
326 size_t len = do_clz(len_byte ^ 0xffu) + 9 - sizeof(unsigned) * 8;
327 #else
328 size_t len = 2;
329 for (unsigned char m = 0x40; len_byte & m; m >>= 1) ++len;
330 #endif
331 if (rare(size_t(end - ptr) < len)) {
332 return false;
334 unsigned mask = 0xff << (9 - len);
335 len_byte &= ~mask;
337 // Check for overflow.
338 if (rare(len > int(sizeof(U)))) return false;
339 if (sizeof(U) != 8) {
340 // Need to check the top byte too.
341 if (rare(len == int(sizeof(U)) && len_byte != 0)) return false;
344 end = ptr + len;
345 *p = end;
347 U r = len_byte;
348 while (ptr != end) {
349 r = (r << 8) | U(static_cast<unsigned char>(*ptr++));
351 *result = r;
353 return true;
356 /** Append an encoded unsigned integer to a string.
358 * @param s The string to append to.
359 * @param value The unsigned integer to encode.
361 template<class U>
362 inline void
363 pack_uint(std::string & s, U value)
365 // Check U is an unsigned type.
366 STATIC_ASSERT_UNSIGNED_TYPE(U);
368 while (value >= 128) {
369 s += static_cast<char>(static_cast<unsigned char>(value) | 0x80);
370 value >>= 7;
372 s += static_cast<char>(value);
375 /** Append an encoded unsigned integer (bool type) to a string.
377 * @param s The string to append to.
378 * @param value The unsigned integer to encode.
380 template<>
381 inline void
382 pack_uint(std::string & s, bool value)
384 s += static_cast<char>(value);
387 /** Decode an unsigned integer from a string.
389 * @param p Pointer to pointer to the current position in the string.
390 * @param end Pointer to the end of the string.
391 * @param result Where to store the result (or NULL to just skip it).
393 template<class U>
394 inline bool
395 unpack_uint(const char ** p, const char * end, U * result)
397 // Check U is an unsigned type.
398 STATIC_ASSERT_UNSIGNED_TYPE(U);
400 const char * ptr = *p;
401 Assert(ptr);
402 const char * start = ptr;
404 // Check the length of the encoded integer first.
405 do {
406 if (rare(ptr == end)) {
407 // Out of data.
408 *p = NULL;
409 return false;
411 } while (static_cast<unsigned char>(*ptr++) >= 128);
413 *p = ptr;
415 if (!result) return true;
417 *result = U(*--ptr);
418 if (ptr == start) {
419 // Special case for small values.
420 return true;
423 size_t maxbits = size_t(ptr - start) * 7;
424 if (maxbits <= sizeof(U) * 8) {
425 // No possibility of overflow.
426 do {
427 unsigned char chunk = static_cast<unsigned char>(*--ptr) & 0x7f;
428 *result = (*result << 7) | U(chunk);
429 } while (ptr != start);
430 return true;
433 size_t minbits = maxbits - 6;
434 if (rare(minbits > sizeof(U) * 8)) {
435 // Overflow.
436 return false;
439 while (--ptr != start) {
440 unsigned char chunk = static_cast<unsigned char>(*--ptr) & 0x7f;
441 *result = (*result << 7) | U(chunk);
444 U tmp = *result;
445 *result <<= 7;
446 if (rare(*result < tmp)) {
447 // Overflow.
448 return false;
450 *result |= U(static_cast<unsigned char>(*ptr) & 0x7f);
451 return true;
454 /** Append an encoded std::string to a string.
456 * @param s The string to append to.
457 * @param value The std::string to encode.
459 inline void
460 pack_string(std::string & s, const std::string & value)
462 pack_uint(s, value.size());
463 s += value;
466 /** Append an encoded C-style string to a string.
468 * @param s The string to append to.
469 * @param ptr The C-style string to encode.
471 inline void
472 pack_string(std::string & s, const char * ptr)
474 Assert(ptr);
475 size_t len = std::strlen(ptr);
476 pack_uint(s, len);
477 s.append(ptr, len);
480 /** Decode a std::string from a string.
482 * @param p Pointer to pointer to the current position in the string.
483 * @param end Pointer to the end of the string.
484 * @param result Where to store the result.
486 inline bool
487 unpack_string(const char ** p, const char * end, std::string & result)
489 size_t len;
490 if (rare(!unpack_uint(p, end, &len))) {
491 return false;
494 const char * & ptr = *p;
495 if (rare(len > size_t(end - ptr))) {
496 ptr = NULL;
497 return false;
500 result.assign(ptr, len);
501 ptr += len;
502 return true;
505 /** Append an encoded std::string to a string, preserving the sort order.
507 * The byte which follows this encoded value *must not* be \xff, or the sort
508 * order won't be correct. You may need to store a padding byte (\0 say) to
509 * ensure this. Note that pack_uint_preserving_sort() can never produce
510 * \xff as its first byte so is safe to use immediately afterwards.
512 * @param s The string to append to.
513 * @param value The std::string to encode.
514 * @param last If true, this is the last thing to be encoded in this
515 * string - see note below (default: false)
517 * It doesn't make sense to use pack_string_preserving_sort() if nothing can
518 * ever follow, but if optional items can, you can set last=true in cases
519 * where nothing does and get a shorter encoding in those cases.
521 inline void
522 pack_string_preserving_sort(std::string & s, const std::string & value,
523 bool last = false)
525 std::string::size_type b = 0, e;
526 while ((e = value.find('\0', b)) != std::string::npos) {
527 ++e;
528 s.append(value, b, e - b);
529 s += '\xff';
530 b = e;
532 s.append(value, b, std::string::npos);
533 if (!last) s += '\0';
536 /** Decode a "sort preserved" std::string from a string.
538 * The std::string must have been encoded with pack_string_preserving_sort().
540 * @param p Pointer to pointer to the current position in the string.
541 * @param end Pointer to the end of the string.
542 * @param result Where to store the result.
544 inline bool
545 unpack_string_preserving_sort(const char ** p, const char * end,
546 std::string & result)
548 result.resize(0);
550 const char *ptr = *p;
551 Assert(ptr);
553 while (ptr != end) {
554 char ch = *ptr++;
555 if (rare(ch == '\0')) {
556 if (usual(ptr == end || *ptr != '\xff')) {
557 break;
559 ++ptr;
561 result += ch;
563 *p = ptr;
564 return true;
567 inline std::string
568 pack_chert_postlist_key(const std::string &term)
570 // Special case for doclen lists.
571 if (term.empty())
572 return std::string("\x00\xe0", 2);
574 std::string key;
575 pack_string_preserving_sort(key, term, true);
576 return key;
579 inline std::string
580 pack_chert_postlist_key(const std::string &term, Xapian::docid did)
582 // Special case for doclen lists.
583 if (term.empty()) {
584 std::string key("\x00\xe0", 2);
585 C_pack_uint_preserving_sort(key, did);
586 return key;
589 std::string key;
590 pack_string_preserving_sort(key, term);
591 C_pack_uint_preserving_sort(key, did);
592 return key;
595 inline std::string
596 pack_glass_postlist_key(const std::string &term)
598 // Special case for doclen lists.
599 if (term.empty())
600 return std::string("\x00\xe0", 2);
602 std::string key;
603 pack_string_preserving_sort(key, term, true);
604 return key;
607 inline std::string
608 pack_glass_postlist_key(const std::string &term, Xapian::docid did)
610 // Special case for doclen lists.
611 if (term.empty()) {
612 std::string key("\x00\xe0", 2);
613 pack_uint_preserving_sort(key, did);
614 return key;
617 std::string key;
618 pack_string_preserving_sort(key, term);
619 pack_uint_preserving_sort(key, did);
620 return key;
623 #endif // XAPIAN_INCLUDED_PACK_H