Document xapian-compact --blocksize takes an argument
[xapian.git] / xapian-core / api / sortable-serialise.cc
blob919d2d32fc0a15a3baa0a3946e2e92ef755e5e19
1 /** @file sortable-serialise.cc
2 * @brief Serialise floating point values to string which sort the same way.
3 */
4 /* Copyright (C) 2007,2009,2015 Olly Betts
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
21 #include <config.h>
23 #ifndef XAPIAN_UNITTEST
24 # include <xapian/queryparser.h>
25 // Only enable exceptions for the testsuite - we don't want them in a library
26 // build as these functions are marked not to throw exceptions in the API
27 // headers.
28 # define UNITTEST_ASSERT(X)
29 #else
30 # include "omassert.h"
31 # define UNITTEST_ASSERT(X) Assert(X)
32 #endif
34 #include <cfloat>
35 #include <cmath>
36 #include <cstring>
38 #include <string>
40 using namespace std;
42 #if FLT_RADIX != 2
43 # error Code currently assumes FLT_RADIX == 2
44 #endif
46 #ifdef _MSC_VER
47 // Disable warning about negating an unsigned type, which we do deliberately.
48 # pragma warning(disable:4146)
49 #endif
51 size_t
52 Xapian::sortable_serialise_(double value, char * buf) XAPIAN_NOEXCEPT
54 double mantissa;
55 int exponent;
57 // Negative infinity.
58 if (value < -DBL_MAX) return 0;
60 mantissa = frexp(value, &exponent);
62 /* Deal with zero specially.
64 * IEEE representation of doubles uses 11 bits for the exponent, with a
65 * bias of 1023. We bias this by subtracting 8, and non-IEEE
66 * representations may allow higher exponents, so allow exponents down to
67 * -2039 - if smaller exponents are possible anywhere, we underflow such
68 * numbers to 0.
70 if (mantissa == 0.0 || exponent < -2039) {
71 *buf = '\x80';
72 return 1;
75 bool negative = (mantissa < 0);
76 if (negative) mantissa = -mantissa;
78 // Infinity, or extremely large non-IEEE representation.
79 if (value > DBL_MAX || exponent > 2055) {
80 if (negative) {
81 // This can only happen with a non-IEEE representation, because
82 // we've already tested for value < -DBL_MAX
83 return 0;
84 } else {
85 memset(buf, '\xff', 9);
86 return 9;
90 // Encoding:
92 // [ 7 | 6 | 5 | 4 3 2 1 0]
93 // Sm Se Le
95 // Sm stores the sign of the mantissa: 1 = positive or zero, 0 = negative.
96 // Se stores the sign of the exponent: Sm for positive/zero, !Sm for neg.
97 // Le stores the length of the exponent: !Se for 3 bits, Se for 11 bits.
98 unsigned char next = (negative ? 0 : 0xe0);
100 // Bias the exponent by 8 so that more small integers get short encodings.
101 exponent -= 8;
102 bool exponent_negative = (exponent < 0);
103 if (exponent_negative) {
104 exponent = -exponent;
105 next ^= 0x60;
108 size_t len = 0;
110 /* We store the exponent in 3 or 11 bits. If the number is negative, we
111 * flip all the bits of the exponent, since larger negative numbers should
112 * sort first.
114 * If the exponent is negative, we flip the bits of the exponent, since
115 * larger negative exponents should sort first (unless the number is
116 * negative, in which case they should sort later).
118 UNITTEST_ASSERT(exponent >= 0);
119 if (exponent < 8) {
120 next ^= 0x20;
121 next |= static_cast<unsigned char>(exponent << 2);
122 if (negative ^ exponent_negative) next ^= 0x1c;
123 } else {
124 UNITTEST_ASSERT((exponent >> 11) == 0);
125 // Put the top 5 bits of the exponent into the lower 5 bits of the
126 // first byte:
127 next |= static_cast<unsigned char>(exponent >> 6);
128 if (negative ^ exponent_negative) next ^= 0x1f;
129 buf[len++] = next;
130 // And the lower 6 bits of the exponent go into the upper 6 bits
131 // of the second byte:
132 next = static_cast<unsigned char>(exponent) << 2;
133 if (negative ^ exponent_negative) next ^= 0xfc;
136 // Convert the 52 (or 53) bits of the mantissa into two 32-bit words.
137 mantissa *= 1 << (negative ? 26 : 27);
138 unsigned word1 = static_cast<unsigned>(mantissa);
139 mantissa -= word1;
140 unsigned word2 = static_cast<unsigned>(mantissa * 4294967296.0); // 1<<32
141 // If the number is positive, the first bit will always be set because 0.5
142 // <= mantissa < 1, unless mantissa is zero, which we handle specially
143 // above). If the number is negative, we negate the mantissa instead of
144 // flipping all the bits, so in the case of 0.5, the first bit isn't set
145 // so we need to store it explicitly. But for the cost of one extra
146 // leading bit, we can save several trailing 0xff bytes in lots of common
147 // cases.
148 UNITTEST_ASSERT(negative || (word1 & (1<<26)));
149 if (negative) {
150 // We negate the mantissa for negative numbers, so that the sort order
151 // is reversed (since larger negative numbers should come first).
152 word1 = -word1;
153 if (word2 != 0) ++word1;
154 word2 = -word2;
157 word1 &= 0x03ffffff;
158 next |= static_cast<unsigned char>(word1 >> 24);
159 buf[len++] = next;
160 buf[len++] = char(word1 >> 16);
161 buf[len++] = char(word1 >> 8);
162 buf[len++] = char(word1);
164 buf[len++] = char(word2 >> 24);
165 buf[len++] = char(word2 >> 16);
166 buf[len++] = char(word2 >> 8);
167 buf[len++] = char(word2);
169 // Finally, we can chop off any trailing zero bytes.
170 while (len > 0 && buf[len - 1] == '\0') {
171 --len;
174 return len;
177 /// Get a number from the character at a given position in a string, returning
178 /// 0 if the string isn't long enough.
179 static inline unsigned char
180 numfromstr(const std::string & str, std::string::size_type pos)
182 return (pos < str.size()) ? static_cast<unsigned char>(str[pos]) : '\0';
185 double
186 Xapian::sortable_unserialise(const std::string & value) XAPIAN_NOEXCEPT
188 // Zero.
189 if (value.size() == 1 && value[0] == '\x80') return 0.0;
191 // Positive infinity.
192 if (value.size() == 9 &&
193 memcmp(value.data(), "\xff\xff\xff\xff\xff\xff\xff\xff\xff", 9) == 0) {
194 #ifdef INFINITY
195 // INFINITY is C99. Oddly, it's of type "float" so sanity check in
196 // case it doesn't cast to double as infinity (apparently some
197 // implementations have this problem).
198 if (double(INFINITY) > HUGE_VAL) return INFINITY;
199 #endif
200 return HUGE_VAL;
203 // Negative infinity.
204 if (value.empty()) {
205 #ifdef INFINITY
206 if (double(INFINITY) > HUGE_VAL) return -INFINITY;
207 #endif
208 return -HUGE_VAL;
211 unsigned char first = numfromstr(value, 0);
212 size_t i = 0;
214 first ^= static_cast<unsigned char>(first & 0xc0) >> 1;
215 bool negative = !(first & 0x80);
216 bool exponent_negative = (first & 0x40);
217 bool explen = !(first & 0x20);
218 int exponent = first & 0x1f;
219 if (!explen) {
220 exponent >>= 2;
221 if (negative ^ exponent_negative) exponent ^= 0x07;
222 } else {
223 first = numfromstr(value, ++i);
224 exponent <<= 6;
225 exponent |= (first >> 2);
226 if (negative ^ exponent_negative) exponent ^= 0x07ff;
229 unsigned word1;
230 word1 = (unsigned(first & 0x03) << 24);
231 word1 |= numfromstr(value, ++i) << 16;
232 word1 |= numfromstr(value, ++i) << 8;
233 word1 |= numfromstr(value, ++i);
235 unsigned word2 = 0;
236 if (i < value.size()) {
237 word2 = numfromstr(value, ++i) << 24;
238 word2 |= numfromstr(value, ++i) << 16;
239 word2 |= numfromstr(value, ++i) << 8;
240 word2 |= numfromstr(value, ++i);
243 if (negative) {
244 word1 = -word1;
245 if (word2 != 0) ++word1;
246 word2 = -word2;
247 UNITTEST_ASSERT((word1 & 0xf0000000) != 0);
248 word1 &= 0x03ffffff;
250 if (!negative) word1 |= 1<<26;
252 double mantissa = 0;
253 if (word2) mantissa = word2 / 4294967296.0; // 1<<32
254 mantissa += word1;
255 mantissa /= 1 << (negative ? 26 : 27);
257 if (exponent_negative) exponent = -exponent;
258 exponent += 8;
260 if (negative) mantissa = -mantissa;
262 // We use scalbn() since it's equivalent to ldexp() when FLT_RADIX == 2
263 // (which we currently assume), except that ldexp() will set errno if the
264 // result overflows or underflows, which isn't really desirable here.
265 return scalbn(mantissa, exponent);