2 * @brief Pack types into strings and unpack them again.
4 /* Copyright (C) 2009,2015 Olly Betts
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
21 #ifndef XAPIAN_INCLUDED_PACK_H
22 #define XAPIAN_INCLUDED_PACK_H
29 #include "xapian/types.h"
31 /** How many bits to store the length of a sortable uint in.
33 * Setting this to 2 limits us to 2**32 documents in the database. If set
34 * to 3, then 2**64 documents are possible, but the database format isn't
37 const unsigned int SORTABLE_UINT_LOG2_MAX_BYTES
= 2;
39 /// Calculated value used below.
40 const unsigned int SORTABLE_UINT_MAX_BYTES
= 1 << SORTABLE_UINT_LOG2_MAX_BYTES
;
42 /// Calculated value used below.
43 const unsigned int SORTABLE_UINT_1ST_BYTE_MASK
=
44 (0xffu
>> SORTABLE_UINT_LOG2_MAX_BYTES
);
46 /** Append an encoded bool to a string.
48 * @param s The string to append to.
49 * @param value The bool to encode.
52 pack_bool(std::string
& s
, bool value
)
54 s
+= char('0' | static_cast<char>(value
));
57 /** Decode a bool from a string.
59 * @param p Pointer to pointer to the current position in the string.
60 * @param end Pointer to the end of the string.
61 * @param result Where to store the result.
64 unpack_bool(const char ** p
, const char * end
, bool * result
)
67 const char * & ptr
= *p
;
70 if (rare(ptr
== end
|| ((ch
= *ptr
++ - '0') &~ 1))) {
74 *result
= static_cast<bool>(ch
);
78 /** Append an encoded unsigned integer to a string as the last item.
80 * This encoding is only suitable when this is the last thing encoded as
81 * the encoding used doesn't contain its own length.
83 * @param s The string to append to.
84 * @param value The unsigned integer to encode.
88 pack_uint_last(std::string
& s
, U value
)
90 // Check U is an unsigned type.
91 STATIC_ASSERT_UNSIGNED_TYPE(U
);
94 s
+= char(value
& 0xff);
99 /** Decode an unsigned integer as the last item in a string.
101 * @param p Pointer to pointer to the current position in the string.
102 * @param end Pointer to the end of the string.
103 * @param result Where to store the result.
107 unpack_uint_last(const char ** p
, const char * end
, U
* result
)
109 // Check U is an unsigned type.
110 STATIC_ASSERT_UNSIGNED_TYPE(U
);
113 const char * ptr
= *p
;
117 // Check for overflow.
118 if (rare(end
- ptr
> int(sizeof(U
)))) {
124 *result
= (*result
<< 8) | U(static_cast<unsigned char>(*--end
));
130 /** Append an encoded unsigned integer to a string, preserving the sort order.
134 * The appended string data will sort in the same order as the unsigned
135 * integer being encoded.
137 * Note that the first byte of the encoding will never be \xff, so it is
138 * safe to store the result of this function immediately after the result of
139 * pack_string_preserving_sort().
141 * @param s The string to append to.
142 * @param value The unsigned integer to encode.
146 C_pack_uint_preserving_sort(std::string
& s
, U value
)
148 // Check U is an unsigned type.
149 STATIC_ASSERT_UNSIGNED_TYPE(U
);
151 // FIXME: Doesn't work with 64-bit Xapian::docid, etc.
152 static_assert(sizeof(U
) <= SORTABLE_UINT_MAX_BYTES
,
153 "Template type U too wide for database format");
156 char tmp
[sizeof(U
) + 1];
157 char * p
= tmp
+ sizeof(tmp
);
160 *--p
= char(value
& 0xff);
162 } while (value
&~ SORTABLE_UINT_1ST_BYTE_MASK
);
164 unsigned char len
= static_cast<unsigned char>(tmp
+ sizeof(tmp
) - p
);
165 *--p
= char((len
- 1) << (8 - SORTABLE_UINT_LOG2_MAX_BYTES
) | value
);
166 s
.append(p
, len
+ 1);
167 Assert(s
[0] != '\xff');
170 /** Decode an "sort preserved" unsigned integer from a string.
174 * The unsigned integer must have been encoded with
175 * C_pack_uint_preserving_sort().
177 * @param p Pointer to pointer to the current position in the string.
178 * @param end Pointer to the end of the string.
179 * @param result Where to store the result.
183 C_unpack_uint_preserving_sort(const char ** p
, const char * end
, U
* result
)
185 // Check U is an unsigned type.
186 STATIC_ASSERT_UNSIGNED_TYPE(U
);
187 static_assert(sizeof(U
) < 256,
188 "Template type U too wide for database format");
191 const char * ptr
= *p
;
194 if (rare(ptr
== end
)) {
198 unsigned char len_byte
= static_cast<unsigned char>(*ptr
++);
199 *result
= len_byte
& SORTABLE_UINT_1ST_BYTE_MASK
;
200 size_t len
= (len_byte
>> (8 - SORTABLE_UINT_LOG2_MAX_BYTES
)) + 1;
202 if (rare(size_t(end
- ptr
) < len
)) {
209 // Check for overflow.
210 if (rare(len
> int(sizeof(U
)))) {
215 *result
= (*result
<< 8) | U(static_cast<unsigned char>(*ptr
++));
222 // GCC 3.4 added __builtin_clz() (with l and ll variants).
223 inline int do_clz(unsigned value
) { return __builtin_clz(value
); }
225 inline int do_clz(unsigned long value
) { return __builtin_clzl(value
); }
227 inline int do_clz(unsigned long long value
) { return __builtin_clzll(value
); }
232 /** Append an encoded unsigned integer to a string, preserving the sort order.
234 * [Glass and newer variant]
236 * The appended string data will sort in the same order as the unsigned
237 * integer being encoded.
239 * Note that the first byte of the encoding will never be \xff, so it is
240 * safe to store the result of this function immediately after the result of
241 * pack_string_preserving_sort().
243 * @param s The string to append to.
244 * @param value The unsigned integer to encode.
248 pack_uint_preserving_sort(std::string
& s
, U value
)
250 // Check U is an unsigned type.
251 STATIC_ASSERT_UNSIGNED_TYPE(U
);
252 static_assert(sizeof(U
) <= 8,
253 "Template type U too wide for database format");
254 // The clz() functions are undefined for 0, so handle the smallest band
255 // as a special case.
256 if (value
< 0x8000) {
257 s
.resize(s
.size() + 2);
258 s
[s
.size() - 2] = static_cast<unsigned char>(value
>> 8);
259 Assert(s
[s
.size() - 2] != '\xff');
260 s
[s
.size() - 1] = static_cast<unsigned char>(value
);
265 size_t len
= ((sizeof(U
) * 8 + 5) - do_clz(value
)) / 7;
268 for (U x
= value
>> 22; x
; x
>>= 7) ++len
;
270 unsigned mask
= 0xff << (10 - len
);
272 s
.resize(s
.size() + len
);
273 for (size_t i
= 1; i
!= len
; ++i
) {
274 s
[s
.size() - i
] = static_cast<unsigned char>(value
);
278 s
[s
.size() - len
] = static_cast<unsigned char>(value
| mask
);
279 Assert(s
[s
.size() - len
] != '\xff');
281 AssertRel(len
, >, 2);
282 AssertRel(len
, <=, 9);
285 /** Decode an "sort preserved" unsigned integer from a string.
287 * [Glass and newer variant]
289 * The unsigned integer must have been encoded with
290 * pack_uint_preserving_sort().
292 * @param p Pointer to pointer to the current position in the string.
293 * @param end Pointer to the end of the string.
294 * @param result Where to store the result.
298 unpack_uint_preserving_sort(const char ** p
, const char * end
, U
* result
)
300 // Check U is an unsigned type.
301 STATIC_ASSERT_UNSIGNED_TYPE(U
);
302 static_assert(sizeof(U
) <= 8,
303 "Template type U too wide for database format");
306 const char * ptr
= *p
;
309 if (rare(ptr
== end
)) {
313 unsigned char len_byte
= static_cast<unsigned char>(*ptr
++);
314 if (len_byte
< 0x80) {
315 *result
= (U(len_byte
) << 8) | static_cast<unsigned char>(*ptr
++);
320 if (len_byte
== 0xff) {
324 // len is how many bytes there are after the length byte.
326 size_t len
= do_clz(len_byte
^ 0xffu
) + 9 - sizeof(unsigned) * 8;
329 for (unsigned char m
= 0x40; len_byte
& m
; m
>>= 1) ++len
;
331 if (rare(size_t(end
- ptr
) < len
)) {
334 unsigned mask
= 0xff << (9 - len
);
337 // Check for overflow.
338 if (rare(len
> int(sizeof(U
)))) return false;
339 if (sizeof(U
) != 8) {
340 // Need to check the top byte too.
341 if (rare(len
== int(sizeof(U
)) && len_byte
!= 0)) return false;
349 r
= (r
<< 8) | U(static_cast<unsigned char>(*ptr
++));
356 /** Append an encoded unsigned integer to a string.
358 * @param s The string to append to.
359 * @param value The unsigned integer to encode.
363 pack_uint(std::string
& s
, U value
)
365 // Check U is an unsigned type.
366 STATIC_ASSERT_UNSIGNED_TYPE(U
);
368 while (value
>= 128) {
369 s
+= static_cast<char>(static_cast<unsigned char>(value
) | 0x80);
372 s
+= static_cast<char>(value
);
375 /** Append an encoded unsigned integer (bool type) to a string.
377 * @param s The string to append to.
378 * @param value The unsigned integer to encode.
382 pack_uint(std::string
& s
, bool value
)
384 s
+= static_cast<char>(value
);
387 /** Decode an unsigned integer from a string.
389 * @param p Pointer to pointer to the current position in the string.
390 * @param end Pointer to the end of the string.
391 * @param result Where to store the result (or NULL to just skip it).
395 unpack_uint(const char ** p
, const char * end
, U
* result
)
397 // Check U is an unsigned type.
398 STATIC_ASSERT_UNSIGNED_TYPE(U
);
400 const char * ptr
= *p
;
402 const char * start
= ptr
;
404 // Check the length of the encoded integer first.
406 if (rare(ptr
== end
)) {
411 } while (static_cast<unsigned char>(*ptr
++) >= 128);
415 if (!result
) return true;
419 // Special case for small values.
423 size_t maxbits
= size_t(ptr
- start
) * 7;
424 if (maxbits
<= sizeof(U
) * 8) {
425 // No possibility of overflow.
427 unsigned char chunk
= static_cast<unsigned char>(*--ptr
) & 0x7f;
428 *result
= (*result
<< 7) | U(chunk
);
429 } while (ptr
!= start
);
433 size_t minbits
= maxbits
- 6;
434 if (rare(minbits
> sizeof(U
) * 8)) {
439 while (--ptr
!= start
) {
440 unsigned char chunk
= static_cast<unsigned char>(*--ptr
) & 0x7f;
441 *result
= (*result
<< 7) | U(chunk
);
446 if (rare(*result
< tmp
)) {
450 *result
|= U(static_cast<unsigned char>(*ptr
) & 0x7f);
454 /** Append an encoded std::string to a string.
456 * @param s The string to append to.
457 * @param value The std::string to encode.
460 pack_string(std::string
& s
, const std::string
& value
)
462 pack_uint(s
, value
.size());
466 /** Append an encoded C-style string to a string.
468 * @param s The string to append to.
469 * @param ptr The C-style string to encode.
472 pack_string(std::string
& s
, const char * ptr
)
475 size_t len
= std::strlen(ptr
);
480 /** Decode a std::string from a string.
482 * @param p Pointer to pointer to the current position in the string.
483 * @param end Pointer to the end of the string.
484 * @param result Where to store the result.
487 unpack_string(const char ** p
, const char * end
, std::string
& result
)
490 if (rare(!unpack_uint(p
, end
, &len
))) {
494 const char * & ptr
= *p
;
495 if (rare(len
> size_t(end
- ptr
))) {
500 result
.assign(ptr
, len
);
505 /** Append an encoded std::string to a string, preserving the sort order.
507 * The byte which follows this encoded value *must not* be \xff, or the sort
508 * order won't be correct. You may need to store a padding byte (\0 say) to
509 * ensure this. Note that pack_uint_preserving_sort() can never produce
510 * \xff as its first byte so is safe to use immediately afterwards.
512 * @param s The string to append to.
513 * @param value The std::string to encode.
514 * @param last If true, this is the last thing to be encoded in this
515 * string - see note below (default: false)
517 * It doesn't make sense to use pack_string_preserving_sort() if nothing can
518 * ever follow, but if optional items can, you can set last=true in cases
519 * where nothing does and get a shorter encoding in those cases.
522 pack_string_preserving_sort(std::string
& s
, const std::string
& value
,
525 std::string::size_type b
= 0, e
;
526 while ((e
= value
.find('\0', b
)) != std::string::npos
) {
528 s
.append(value
, b
, e
- b
);
532 s
.append(value
, b
, std::string::npos
);
533 if (!last
) s
+= '\0';
536 /** Decode a "sort preserved" std::string from a string.
538 * The std::string must have been encoded with pack_string_preserving_sort().
540 * @param p Pointer to pointer to the current position in the string.
541 * @param end Pointer to the end of the string.
542 * @param result Where to store the result.
545 unpack_string_preserving_sort(const char ** p
, const char * end
,
546 std::string
& result
)
550 const char *ptr
= *p
;
555 if (rare(ch
== '\0')) {
556 if (usual(ptr
== end
|| *ptr
!= '\xff')) {
568 pack_chert_postlist_key(const std::string
&term
)
570 // Special case for doclen lists.
572 return std::string("\x00\xe0", 2);
575 pack_string_preserving_sort(key
, term
, true);
580 pack_chert_postlist_key(const std::string
&term
, Xapian::docid did
)
582 // Special case for doclen lists.
584 std::string
key("\x00\xe0", 2);
585 C_pack_uint_preserving_sort(key
, did
);
590 pack_string_preserving_sort(key
, term
);
591 C_pack_uint_preserving_sort(key
, did
);
596 pack_glass_postlist_key(const std::string
&term
)
598 // Special case for doclen lists.
600 return std::string("\x00\xe0", 2);
603 pack_string_preserving_sort(key
, term
, true);
608 pack_glass_postlist_key(const std::string
&term
, Xapian::docid did
)
610 // Special case for doclen lists.
612 std::string
key("\x00\xe0", 2);
613 pack_uint_preserving_sort(key
, did
);
618 pack_string_preserving_sort(key
, term
);
619 pack_uint_preserving_sort(key
, did
);
623 #endif // XAPIAN_INCLUDED_PACK_H