dlzma: added fix for very rare out-of-bounds write in LZMA encoder (it can happen...
[iv.d.git] / hash / testhash.d
blob1fb7a562bf0f7dfef6e7fea6da46b05e52a212c5
1 /* Invisible Vector Library
2 * coded by Ketmar // Invisible Vector <ketmar@ketmar.no-ip.org>
3 * Understanding is not required. Only obedience.
5 * This program is free software: you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation, version 3 of the License ONLY.
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
14 * You should have received a copy of the GNU General Public License
15 * along with this program. If not, see <http://www.gnu.org/licenses/>.
17 module testhash is aliced;
19 import std.traits;
21 import iv.hash.siphash;
22 import iv.hash.murhash;
23 import iv.hash.fasthash;
24 import iv.hash.joaat;
27 version(use_custom_dict) {
28 enum WordsFile = "_00/zwords.txt";
29 } else {
30 enum WordsFile = "/usr/share/dict/words";
34 bool[string] words;
37 void loadWords () {
38 import std.stdio : File;
40 void addWord (const(char)[] w) {
41 if (w.length == 0 || w in words) return;
42 words[w.idup] = true;
45 foreach (auto line; File(WordsFile).byLine) {
46 import std.string : strip;
47 line = line.strip;
48 addWord(line);
49 // all lower
50 foreach (ref ch; line) if (ch >= 'A' && ch <= 'Z') ch += 32;
51 addWord(line);
52 // all upper
53 foreach (ref ch; line) if (ch >= 'a' && ch <= 'z') ch -= 32;
54 addWord(line);
55 // first upper
56 if (line[0] >= 'A' && line[0] <= 'Z') {
57 line[0] += 32;
58 addWord(line);
64 void testHash(T) (string name, T hfn)
65 if (isCallable!T && (is(ReturnType!T == uint) || is(ReturnType!T == ulong)))
67 { import std.stdio : stdout; stdout.write(name, " ... "); stdout.flush(); }
68 alias HType = ReturnType!T;
69 usize[HType] collisions;
71 foreach (immutable w; words.byKey) {
72 static if (__traits(compiles, { auto h = hfn(w); })) {
73 auto h = hfn(w);
74 } else {
75 auto h = hfn(w, 0);
77 if (auto cc = h in collisions) {
78 ++(*cc);
79 } else {
80 collisions[h] = 1;
84 usize maxCollision = 0, elements = 0, colls = 0;
85 foreach (immutable c; collisions.byValue) {
86 if (c > maxCollision) maxCollision = c;
87 if (c) ++elements;
88 if (c > 1) colls += c-1;
90 assert(maxCollision >= 1);
92 if (maxCollision == 1) {
93 import std.stdio;
94 writeln("perfect for ", words.length, " words!");
95 } else {
96 import std.algorithm : sort;
97 import std.stdio;
98 import std.array;
99 writeln(maxCollision-1, " max collisions for ", words.length, " words, ", colls, " collisions total");
100 auto cols = sort!"a>b"(collisions.values).array;
101 cols ~= 0;
102 usize idx = 0;
103 //assert(cols[0] > 1);
104 while (cols[idx] > 1) {
105 uint count = 0;
106 auto c = cols[idx];
107 while (cols[idx] == c) { ++count; ++idx; }
108 writeln(" ", count, " collisions for ", c, " times");
114 uint innerhash (const(void)[] kbuf) @trusted nothrow @nogc {
115 int res;
116 if (kbuf.length == int.sizeof) {
117 import core.stdc.string : memcpy;
118 memcpy(&res, kbuf.ptr, res.sizeof);
119 } else {
120 res = 751;
122 foreach (immutable bt; cast(const(ubyte)[])kbuf) res = res*31+bt;
123 return (res*87767623)&int.max;
127 uint outerhash (const(void)[] kbuf) @trusted nothrow @nogc {
128 int res = 774831917;
129 foreach_reverse (immutable bt; cast(const(ubyte)[])kbuf) res = res*29+bt;
130 return (res*5157883)&int.max;
134 uint djb2 (const(char)[] str) {
135 uint hash = 5381;
136 auto data = cast(const(ubyte)[])str;
137 foreach (ubyte c; data) hash = ((hash << 5)+hash)+c; /* hash * 33 + c */
138 auto len = str.length;
139 while (len != 0) {
140 hash = ((hash << 5)+hash)+(len&0xff); /* hash * 33 + c */
141 len >>= 8;
143 return hash;
147 uint djb2x (const(char)[] str) {
148 uint hash = 5381;
149 auto data = cast(const(ubyte)[])str;
150 foreach (ubyte c; data) hash = ((hash << 5)+hash)^c; /* hash * 33 ^ c */
151 auto len = str.length;
152 while (len != 0) {
153 hash = ((hash << 5)+hash)^(len&0xff); /* hash * 33 ^ c */
154 len >>= 8;
156 return hash;
160 uint sdbm (const(char)[] str) {
161 uint hash = cast(uint)str.length;
162 auto data = cast(const(ubyte)[])str;
163 foreach (ubyte c; data) hash = c + (hash << 6) + (hash << 16) - hash;
164 return hash;
168 uint loselose (const(char)[] str) {
169 uint hash = 0;
170 auto data = cast(const(ubyte)[])str;
171 foreach (ubyte c; data) hash += c;
172 return hash;
176 uint sfHash (const(char)[] str) {
177 return SFhashOf(str.ptr, str.length);
181 usize SFhashOf( const (void)* buf, usize len, usize seed = 0 ) @trusted pure nothrow
184 * This is Paul Hsieh's SuperFastHash algorithm, described here:
185 * http://www.azillionmonkeys.com/qed/hash.html
186 * It is protected by the following open source license:
187 * http://www.azillionmonkeys.com/qed/weblicense.html
189 static uint get16bits( const (ubyte)* x ) pure nothrow
191 // CTFE doesn't support casting ubyte* -> ushort*, so revert to
192 // per-byte access when in CTFE.
193 version( HasUnalignedOps )
195 if (!__ctfe)
196 return *cast(ushort*) x;
199 return ((cast(uint) x[1]) << 8) + (cast(uint) x[0]);
202 // NOTE: SuperFastHash normally starts with a zero hash value. The seed
203 // value was incorporated to allow chaining.
204 auto data = cast(const (ubyte)*) buf;
205 auto hash = seed;
206 int rem;
208 if( len <= 0 || data is null )
209 return 0;
211 rem = len & 3;
212 len >>= 2;
214 for( ; len > 0; len-- )
216 hash += get16bits( data );
217 auto tmp = (get16bits( data + 2 ) << 11) ^ hash;
218 hash = (hash << 16) ^ tmp;
219 data += 2 * ushort.sizeof;
220 hash += hash >> 11;
223 switch( rem )
225 case 3: hash += get16bits( data );
226 hash ^= hash << 16;
227 hash ^= data[ushort.sizeof] << 18;
228 hash += hash >> 11;
229 break;
230 case 2: hash += get16bits( data );
231 hash ^= hash << 11;
232 hash += hash >> 17;
233 break;
234 case 1: hash += *data;
235 hash ^= hash << 10;
236 hash += hash >> 1;
237 break;
238 default:
239 break;
242 /* Force "avalanching" of final 127 bits */
243 hash ^= hash << 3;
244 hash += hash >> 5;
245 hash ^= hash << 4;
246 hash += hash >> 17;
247 hash ^= hash << 25;
248 hash += hash >> 6;
250 return hash;
254 uint ffHash (const(char)[] str) {
255 return FFhashOf(str.ptr, str.length);
260 * 64-bit implementation of fasthash
262 * Params:
263 * buf = data buffer
264 * seed = the seed
266 * Returns:
267 * 32-bit or 64-bit hash
269 usize FFhashOf (const(void)* buf, usize len, usize seed=0) @trusted pure nothrow @nogc {
270 enum Get8Bytes = q{
271 cast(ulong)data[0]|
272 (cast(ulong)data[1]<<8)|
273 (cast(ulong)data[2]<<16)|
274 (cast(ulong)data[3]<<24)|
275 (cast(ulong)data[4]<<32)|
276 (cast(ulong)data[5]<<40)|
277 (cast(ulong)data[6]<<48)|
278 (cast(ulong)data[7]<<56)
280 enum m = 0x880355f21e6d1965UL;
281 auto data = cast(const(ubyte)*)buf;
282 ulong h = seed;
283 ulong t;
284 foreach (immutable _; 0..len/8) {
285 version(HasUnalignedOps) {
286 if (__ctfe) {
287 t = mixin(Get8Bytes);
288 } else {
289 t = *cast(ulong*)data;
291 } else {
292 t = mixin(Get8Bytes);
294 data += 8;
295 t ^= t>>23;
296 t *= 0x2127599bf4325c37UL;
297 t ^= t>>47;
298 h ^= t;
299 h *= m;
302 h ^= len*m;
303 t = 0;
304 switch (len&7) {
305 case 7: t ^= cast(ulong)data[6]<<48; goto case 6;
306 case 6: t ^= cast(ulong)data[5]<<40; goto case 5;
307 case 5: t ^= cast(ulong)data[4]<<32; goto case 4;
308 case 4: t ^= cast(ulong)data[3]<<24; goto case 3;
309 case 3: t ^= cast(ulong)data[2]<<16; goto case 2;
310 case 2: t ^= cast(ulong)data[1]<<8; goto case 1;
311 case 1: t ^= cast(ulong)data[0]; goto default;
312 default:
313 t ^= t>>23;
314 t *= 0x2127599bf4325c37UL;
315 t ^= t>>47;
316 h ^= t;
317 h *= m;
318 break;
321 h ^= h>>23;
322 h *= 0x2127599bf4325c37UL;
323 h ^= h>>47;
324 static if (usize.sizeof == 4) {
325 // 32-bit hash
326 // the following trick converts the 64-bit hashcode to Fermat
327 // residue, which shall retain information from both the higher
328 // and lower parts of hashcode.
329 return cast(usize)(h-(h>>32));
330 } else {
331 return h;
336 uint jenkins_one_at_a_time_hash (const(void)* key, usize len) {
337 uint h;
338 auto data = cast(const(ubyte)*)key;
339 foreach (immutable _; 0..len) {
340 h += *data++;
341 h += (h << 10);
342 h ^= (h >> 6);
345 while (len != 0) {
346 h += len&0xff;
347 h += (h<<10);
348 h ^= (h>>6);
349 len >>= 8;
352 h += (h << 3);
353 h ^= (h >> 11);
354 h += (h << 15);
355 return h;
358 uint joaatHash (const(char)[] str) {
359 return jenkins_one_at_a_time_hash(str.ptr, str.length);
362 void main () {
363 loadWords();
364 testHash("fastHash64", &fastHash64!char);
365 testHash("fastHash32", &fastHash32!char);
366 testHash("siphash24", (const(char)[] buf) => sipHash24Of("0123456789abcdef", buf));
367 testHash("murhash", &murHash32!char);
368 testHash("innerhash", &innerhash);
369 testHash("outerhash", &outerhash);
370 testHash("djb2", &djb2);
371 testHash("djb2x", &djb2x);
372 testHash("sdbm", &sdbm);
373 testHash("sfHash", &sfHash);
374 testHash("ffHash", &ffHash);
375 testHash("joaatHash0", &joaatHash);
376 testHash("joaatHash1", &joaatHash32!char);
377 //testHash("loselose", &loselose);