2 * @brief Pack types into strings and unpack them again.
4 /* Copyright (C) 2009,2015,2016,2017 Olly Betts
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
21 #ifndef XAPIAN_INCLUDED_PACK_H
22 #define XAPIAN_INCLUDED_PACK_H
26 #include <type_traits>
30 #include "xapian/types.h"
32 /** How many bits to store the length of a sortable uint in.
34 * Setting this to 2 limits us to 2**32 documents in the database. If set
35 * to 3, then 2**64 documents are possible, but the database format isn't
38 const unsigned int SORTABLE_UINT_LOG2_MAX_BYTES
= 2;
40 /// Calculated value used below.
41 const unsigned int SORTABLE_UINT_MAX_BYTES
= 1 << SORTABLE_UINT_LOG2_MAX_BYTES
;
43 /// Calculated value used below.
44 const unsigned int SORTABLE_UINT_1ST_BYTE_MASK
=
45 (0xffu
>> SORTABLE_UINT_LOG2_MAX_BYTES
);
47 /** Append an encoded bool to a string.
49 * @param s The string to append to.
50 * @param value The bool to encode.
53 pack_bool(std::string
& s
, bool value
)
55 s
+= char('0' | static_cast<char>(value
));
58 /** Decode a bool from a string.
60 * @param p Pointer to pointer to the current position in the string.
61 * @param end Pointer to the end of the string.
62 * @param result Where to store the result.
65 unpack_bool(const char ** p
, const char * end
, bool * result
)
68 const char * & ptr
= *p
;
71 if (rare(ptr
== end
|| ((ch
= *ptr
++ - '0') &~ 1))) {
75 *result
= static_cast<bool>(ch
);
79 /** Append an encoded unsigned integer to a string as the last item.
81 * This encoding is only suitable when this is the last thing encoded as
82 * the encoding used doesn't contain its own length.
84 * @param s The string to append to.
85 * @param value The unsigned integer to encode.
89 pack_uint_last(std::string
& s
, U value
)
91 static_assert(std::is_unsigned
<U
>::value
, "Unsigned type required");
94 s
+= char(value
& 0xff);
99 /** Decode an unsigned integer as the last item in a string.
101 * @param p Pointer to pointer to the current position in the string.
102 * @param end Pointer to the end of the string.
103 * @param result Where to store the result.
107 unpack_uint_last(const char ** p
, const char * end
, U
* result
)
109 static_assert(std::is_unsigned
<U
>::value
, "Unsigned type required");
112 const char * ptr
= *p
;
116 // Check for overflow.
117 if (rare(end
- ptr
> int(sizeof(U
)))) {
123 *result
= (*result
<< 8) | U(static_cast<unsigned char>(*--end
));
129 #if HAVE_DECL___BUILTIN_CLZ && \
130 HAVE_DECL___BUILTIN_CLZL && \
131 HAVE_DECL___BUILTIN_CLZLL
132 inline int do_clz(unsigned value
) { return __builtin_clz(value
); }
134 inline int do_clz(unsigned long value
) { return __builtin_clzl(value
); }
136 inline int do_clz(unsigned long long value
) { return __builtin_clzll(value
); }
141 /** Append an encoded unsigned integer to a string, preserving the sort order.
143 * The appended string data will sort in the same order as the unsigned
144 * integer being encoded.
146 * Note that the first byte of the encoding will never be \xff, so it is
147 * safe to store the result of this function immediately after the result of
148 * pack_string_preserving_sort().
150 * @param s The string to append to.
151 * @param value The unsigned integer to encode.
155 pack_uint_preserving_sort(std::string
& s
, U value
)
157 static_assert(std::is_unsigned
<U
>::value
, "Unsigned type required");
158 static_assert(sizeof(U
) <= 8,
159 "Template type U too wide for database format");
160 // The clz() functions are undefined for 0, so handle the smallest band
161 // as a special case.
162 if (value
< 0x8000) {
163 s
.resize(s
.size() + 2);
164 s
[s
.size() - 2] = static_cast<unsigned char>(value
>> 8);
165 Assert(s
[s
.size() - 2] != '\xff');
166 s
[s
.size() - 1] = static_cast<unsigned char>(value
);
171 size_t len
= ((sizeof(U
) * 8 + 5) - do_clz(value
)) / 7;
174 for (U x
= value
>> 22; x
; x
>>= 7) ++len
;
176 unsigned mask
= 0xff << (10 - len
);
178 s
.resize(s
.size() + len
);
179 for (size_t i
= 1; i
!= len
; ++i
) {
180 s
[s
.size() - i
] = static_cast<unsigned char>(value
);
184 s
[s
.size() - len
] = static_cast<unsigned char>(value
| mask
);
185 Assert(s
[s
.size() - len
] != '\xff');
187 AssertRel(len
, >, 2);
188 AssertRel(len
, <=, 9);
191 /** Decode an "sort preserved" unsigned integer from a string.
193 * The unsigned integer must have been encoded with
194 * pack_uint_preserving_sort().
196 * @param p Pointer to pointer to the current position in the string.
197 * @param end Pointer to the end of the string.
198 * @param result Where to store the result.
202 unpack_uint_preserving_sort(const char ** p
, const char * end
, U
* result
)
204 static_assert(std::is_unsigned
<U
>::value
, "Unsigned type required");
205 static_assert(sizeof(U
) <= 8,
206 "Template type U too wide for database format");
209 const char * ptr
= *p
;
212 if (rare(ptr
== end
)) {
216 unsigned char len_byte
= static_cast<unsigned char>(*ptr
++);
217 if (len_byte
< 0x80) {
218 *result
= (U(len_byte
) << 8) | static_cast<unsigned char>(*ptr
++);
223 if (len_byte
== 0xff) {
227 // len is how many bytes there are after the length byte.
229 size_t len
= do_clz(len_byte
^ 0xffu
) + 9 - sizeof(unsigned) * 8;
232 for (unsigned char m
= 0x40; len_byte
& m
; m
>>= 1) ++len
;
234 if (rare(size_t(end
- ptr
) < len
)) {
237 unsigned mask
= 0xff << (9 - len
);
240 // Check for overflow.
241 if (rare(len
> int(sizeof(U
)))) return false;
242 if (sizeof(U
) != 8) {
243 // Need to check the top byte too.
244 if (rare(len
== int(sizeof(U
)) && len_byte
!= 0)) return false;
252 r
= (r
<< 8) | U(static_cast<unsigned char>(*ptr
++));
259 /** Append an encoded unsigned integer to a string.
261 * @param s The string to append to.
262 * @param value The unsigned integer to encode.
266 pack_uint(std::string
& s
, U value
)
268 static_assert(std::is_unsigned
<U
>::value
, "Unsigned type required");
270 while (value
>= 128) {
271 s
+= static_cast<char>(static_cast<unsigned char>(value
) | 0x80);
274 s
+= static_cast<char>(value
);
277 /** Append an encoded unsigned integer (bool type) to a string.
279 * @param s The string to append to.
280 * @param value The unsigned integer to encode.
284 pack_uint(std::string
& s
, bool value
)
286 s
+= static_cast<char>(value
);
289 /** Decode an unsigned integer from a string.
291 * @param p Pointer to pointer to the current position in the string.
292 * @param end Pointer to the end of the string.
293 * @param result Where to store the result (or NULL to just skip it).
297 unpack_uint(const char ** p
, const char * end
, U
* result
)
299 static_assert(std::is_unsigned
<U
>::value
, "Unsigned type required");
301 const char * ptr
= *p
;
303 const char * start
= ptr
;
305 // Check the length of the encoded integer first.
307 if (rare(ptr
== end
)) {
312 } while (static_cast<unsigned char>(*ptr
++) >= 128);
316 if (!result
) return true;
320 // Special case for small values.
324 size_t maxbits
= size_t(ptr
- start
) * 7;
325 if (maxbits
<= sizeof(U
) * 8) {
326 // No possibility of overflow.
328 unsigned char chunk
= static_cast<unsigned char>(*--ptr
) & 0x7f;
329 *result
= (*result
<< 7) | U(chunk
);
330 } while (ptr
!= start
);
334 size_t minbits
= maxbits
- 6;
335 if (rare(minbits
> sizeof(U
) * 8)) {
340 while (--ptr
!= start
) {
341 unsigned char chunk
= static_cast<unsigned char>(*--ptr
) & 0x7f;
342 *result
= (*result
<< 7) | U(chunk
);
347 if (rare(*result
< tmp
)) {
351 *result
|= U(static_cast<unsigned char>(*ptr
) & 0x7f);
355 /** Append an encoded std::string to a string.
357 * @param s The string to append to.
358 * @param value The std::string to encode.
361 pack_string(std::string
& s
, const std::string
& value
)
363 pack_uint(s
, value
.size());
367 /** Append an encoded C-style string to a string.
369 * @param s The string to append to.
370 * @param ptr The C-style string to encode.
373 pack_string(std::string
& s
, const char * ptr
)
376 size_t len
= std::strlen(ptr
);
381 /** Decode a std::string from a string.
383 * @param p Pointer to pointer to the current position in the string.
384 * @param end Pointer to the end of the string.
385 * @param result Where to store the result.
388 unpack_string(const char ** p
, const char * end
, std::string
& result
)
391 if (rare(!unpack_uint(p
, end
, &len
))) {
395 const char * & ptr
= *p
;
396 if (rare(len
> size_t(end
- ptr
))) {
401 result
.assign(ptr
, len
);
406 /** Append an encoded std::string to a string, preserving the sort order.
408 * The byte which follows this encoded value *must not* be \xff, or the sort
409 * order won't be correct. You may need to store a padding byte (\0 say) to
410 * ensure this. Note that pack_uint_preserving_sort() can never produce
411 * \xff as its first byte so is safe to use immediately afterwards.
413 * @param s The string to append to.
414 * @param value The std::string to encode.
415 * @param last If true, this is the last thing to be encoded in this
416 * string - see note below (default: false)
418 * It doesn't make sense to use pack_string_preserving_sort() if nothing can
419 * ever follow, but if optional items can, you can set last=true in cases
420 * where nothing does and get a shorter encoding in those cases.
423 pack_string_preserving_sort(std::string
& s
, const std::string
& value
,
426 std::string::size_type b
= 0, e
;
427 while ((e
= value
.find('\0', b
)) != std::string::npos
) {
429 s
.append(value
, b
, e
- b
);
433 s
.append(value
, b
, std::string::npos
);
434 if (!last
) s
+= '\0';
437 /** Decode a "sort preserved" std::string from a string.
439 * The std::string must have been encoded with pack_string_preserving_sort().
441 * @param p Pointer to pointer to the current position in the string.
442 * @param end Pointer to the end of the string.
443 * @param result Where to store the result.
446 unpack_string_preserving_sort(const char ** p
, const char * end
,
447 std::string
& result
)
451 const char *ptr
= *p
;
456 if (rare(ch
== '\0')) {
457 if (usual(ptr
== end
|| *ptr
!= '\xff')) {
469 pack_glass_postlist_key(const std::string
&term
)
471 // Special case for doclen lists.
473 return std::string("\x00\xe0", 2);
476 pack_string_preserving_sort(key
, term
, true);
481 pack_glass_postlist_key(const std::string
&term
, Xapian::docid did
)
483 // Special case for doclen lists.
485 std::string
key("\x00\xe0", 2);
486 pack_uint_preserving_sort(key
, did
);
491 pack_string_preserving_sort(key
, term
);
492 pack_uint_preserving_sort(key
, did
);
496 #endif // XAPIAN_INCLUDED_PACK_H