Fix masking of bits in serialised query
[xapian.git] / xapian-applications / omega / hashterm.cc
blobf064a473fe7adafe256df4d40b6dcbbbdd90f9a8
1 /* hashterm.cc: generate a URL term, truncating and hashing very long URLs.
3 * Copyright (C) 2003 Lemur Consulting Ltd.
4 * Copyright (C) 2003,2004,2006,2011 Olly Betts
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License as
8 * published by the Free Software Foundation; either version 2 of the
9 * License, or (at your option) any later version.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
19 * USA
22 #include <config.h>
23 #include "hashterm.h"
25 using namespace std;
27 /* Hash is computed as an unsigned long, and then converted to a
28 * string by writing 6 bits of it to each output byte. So length is
29 * ceil(4 * 8 / 6) (we use 4 rather than sizeof(unsigned long) so
30 * that the hash is the same regardless of the platform).
32 const unsigned int HASH_LEN = ((4 * 8 + 5) / 6);
34 /* Make a hash of a string - this isn't a very good hashing algorithm, but
35 * it's fast. A collision would result in a document overwriting a different
36 * document, which is not desirable, but also wouldn't be a total disaster.
38 static string
39 hash_string(const string &s)
41 unsigned long int h = 1;
42 for (string::const_iterator i = s.begin(); i != s.end(); ++i) {
43 h += (h << 5) + static_cast<unsigned char>(*i);
45 h &= 0xffffffff; // In case sizeof(unsigned long) > 4
46 // FIXME: It's quirky that we make leading zeros ' ' here, but "embedded"
47 // zeros become char(33) below. Not a problem, but perhaps change ' ' to
48 // char(33) if we need to break backwards compatibility for some other
49 // reason.
50 string result(HASH_LEN, ' ');
51 size_t j = 0;
52 while (h != 0) {
53 char ch = char((h & 63) + 33);
54 result[j++] = ch;
55 h = h >> 6;
57 return result;
60 string
61 hash_long_term(const string &term, unsigned int max_length)
63 if (term.length() <= max_length) return term;
64 string result(term);
65 max_length -= HASH_LEN;
66 result.replace(max_length, string::npos,
67 hash_string(result.substr(max_length)));
68 return result;