hash/testhash.d

   1 /* Invisible Vector Library
   2  * coded by Ketmar // Invisible Vector <ketmar@ketmar.no-ip.org>
   3  * Understanding is not required. Only obedience.
   4  *
   5  * This program is free software: you can redistribute it and/or modify
   6  * it under the terms of the GNU General Public License as published by
   7  * the Free Software Foundation, version 3 of the License ONLY.
   8  *
   9  * This program is distributed in the hope that it will be useful,
  10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12  * GNU General Public License for more details.
  13  *
  14  * You should have received a copy of the GNU General Public License
  15  * along with this program. If not, see <http://www.gnu.org/licenses/>.
  16  */
  17 module testhash is aliced;
  18
  19 import std.traits;
  20
  21 import iv.hash.siphash;
  22 import iv.hash.murhash;
  23 import iv.hash.fasthash;
  24 import iv.hash.joaat;
  25
  26
  27 version(use_custom_dict) {
  28   enum WordsFile = "_00/zwords.txt";
  29 } else {
  30   enum WordsFile = "/usr/share/dict/words";
  31 }
  32
  33
  34 bool[string] words;
  35
  36
  37 void loadWords () {
  38   import std.stdio : File;
  39
  40   void addWord (const(char)[] w) {
  41     if (w.length == 0 || w in words) return;
  42     words[w.idup] = true;
  43   }
  44
  45   foreach (auto line; File(WordsFile).byLine) {
  46     import std.string : strip;
  47     line = line.strip;
  48     addWord(line);
  49     // all lower
  50     foreach (ref ch; line) if (ch >= 'A' && ch <= 'Z') ch += 32;
  51     addWord(line);
  52     // all upper
  53     foreach (ref ch; line) if (ch >= 'a' && ch <= 'z') ch -= 32;
  54     addWord(line);
  55     // first upper
  56     if (line[0] >= 'A' && line[0] <= 'Z') {
  57       line[0] += 32;
  58       addWord(line);
  59     }
  60   }
  61 }
  62
  63
  64 void testHash(T) (string name, T hfn)
  65 if (isCallable!T && (is(ReturnType!T == uint) || is(ReturnType!T == ulong)))
  66 {
  67   { import std.stdio : stdout; stdout.write(name, " ... "); stdout.flush(); }
  68   alias HType = ReturnType!T;
  69   usize[HType] collisions;
  70
  71   foreach (immutable w; words.byKey) {
  72     static if (__traits(compiles, { auto h = hfn(w); })) {
  73       auto h = hfn(w);
  74     } else {
  75       auto h = hfn(w, 0);
  76     }
  77     if (auto cc = h in collisions) {
  78       ++(*cc);
  79     } else {
  80       collisions[h] = 1;
  81     }
  82   }
  83
  84   usize maxCollision = 0, elements = 0, colls = 0;
  85   foreach (immutable c; collisions.byValue) {
  86     if (c > maxCollision) maxCollision = c;
  87     if (c) ++elements;
  88     if (c > 1) colls += c-1;
  89   }
  90   assert(maxCollision >= 1);
  91
  92   if (maxCollision == 1) {
  93     import std.stdio;
  94     writeln("perfect for ", words.length, " words!");
  95   } else {
  96     import std.algorithm : sort;
  97     import std.stdio;
  98     import std.array;
  99     writeln(maxCollision-1, " max collisions for ", words.length, " words, ", colls, " collisions total");
 100     auto cols = sort!"a>b"(collisions.values).array;
 101     cols ~= 0;
 102     usize idx = 0;
 103     //assert(cols[0] > 1);
 104     while (cols[idx] > 1) {
 105       uint count = 0;
 106       auto c = cols[idx];
 107       while (cols[idx] == c) { ++count; ++idx; }
 108       writeln("  ", count, " collisions for ", c, " times");
 109     }
 110   }
 111 }
 112
 113
 114 uint innerhash (const(void)[] kbuf) @trusted nothrow @nogc {
 115   int res;
 116   if (kbuf.length == int.sizeof) {
 117     import core.stdc.string : memcpy;
 118     memcpy(&res, kbuf.ptr, res.sizeof);
 119   } else {
 120     res = 751;
 121   }
 122   foreach (immutable bt; cast(const(ubyte)[])kbuf) res = res*31+bt;
 123   return (res*87767623)&int.max;
 124 }
 125
 126
 127 uint outerhash (const(void)[] kbuf) @trusted nothrow @nogc {
 128   int res = 774831917;
 129   foreach_reverse (immutable bt; cast(const(ubyte)[])kbuf) res = res*29+bt;
 130   return (res*5157883)&int.max;
 131 }
 132
 133
 134 uint djb2 (const(char)[] str) {
 135   uint hash = 5381;
 136   auto data = cast(const(ubyte)[])str;
 137   foreach (ubyte c; data) hash = ((hash << 5)+hash)+c; /* hash * 33 + c */
 138   auto len = str.length;
 139   while (len != 0) {
 140     hash = ((hash << 5)+hash)+(len&0xff); /* hash * 33 + c */
 141     len >>= 8;
 142   }
 143   return hash;
 144 }
 145
 146
 147 uint djb2x (const(char)[] str) {
 148   uint hash = 5381;
 149   auto data = cast(const(ubyte)[])str;
 150   foreach (ubyte c; data) hash = ((hash << 5)+hash)^c; /* hash * 33 ^ c */
 151   auto len = str.length;
 152   while (len != 0) {
 153     hash = ((hash << 5)+hash)^(len&0xff); /* hash * 33 ^ c */
 154     len >>= 8;
 155   }
 156   return hash;
 157 }
 158
 159
 160 uint sdbm (const(char)[] str) {
 161   uint hash = cast(uint)str.length;
 162   auto data = cast(const(ubyte)[])str;
 163   foreach (ubyte c; data) hash = c + (hash << 6) + (hash << 16) - hash;
 164   return hash;
 165 }
 166
 167
 168 uint loselose (const(char)[] str) {
 169   uint hash = 0;
 170   auto data = cast(const(ubyte)[])str;
 171   foreach (ubyte c; data) hash += c;
 172   return hash;
 173 }
 174
 175
 176 uint sfHash (const(char)[] str) {
 177   return SFhashOf(str.ptr, str.length);
 178 }
 179
 180
 181 usize SFhashOf( const (void)* buf, usize len, usize seed = 0 ) @trusted pure nothrow
 182 {
 183     /*
 184      * This is Paul Hsieh's SuperFastHash algorithm, described here:
 185      *   http://www.azillionmonkeys.com/qed/hash.html
 186      * It is protected by the following open source license:
 187      *   http://www.azillionmonkeys.com/qed/weblicense.html
 188      */
 189     static uint get16bits( const (ubyte)* x ) pure nothrow
 190     {
 191         // CTFE doesn't support casting ubyte* -> ushort*, so revert to
 192         // per-byte access when in CTFE.
 193         version( HasUnalignedOps )
 194         {
 195             if (!__ctfe)
 196                 return *cast(ushort*) x;
 197         }
 198
 199         return ((cast(uint) x[1]) << 8) + (cast(uint) x[0]);
 200     }
 201
 202     // NOTE: SuperFastHash normally starts with a zero hash value.  The seed
 203     //       value was incorporated to allow chaining.
 204     auto data = cast(const (ubyte)*) buf;
 205     auto hash = seed;
 206     int  rem;
 207
 208     if( len <= 0 || data is null )
 209         return 0;
 210
 211     rem = len & 3;
 212     len >>= 2;
 213
 214     for( ; len > 0; len-- )
 215     {
 216         hash += get16bits( data );
 217         auto tmp = (get16bits( data + 2 ) << 11) ^ hash;
 218         hash  = (hash << 16) ^ tmp;
 219         data += 2 * ushort.sizeof;
 220         hash += hash >> 11;
 221     }
 222
 223     switch( rem )
 224     {
 225     case 3: hash += get16bits( data );
 226             hash ^= hash << 16;
 227             hash ^= data[ushort.sizeof] << 18;
 228             hash += hash >> 11;
 229             break;
 230     case 2: hash += get16bits( data );
 231             hash ^= hash << 11;
 232             hash += hash >> 17;
 233             break;
 234     case 1: hash += *data;
 235             hash ^= hash << 10;
 236             hash += hash >> 1;
 237             break;
 238      default:
 239             break;
 240     }
 241
 242     /* Force "avalanching" of final 127 bits */
 243     hash ^= hash << 3;
 244     hash += hash >> 5;
 245     hash ^= hash << 4;
 246     hash += hash >> 17;
 247     hash ^= hash << 25;
 248     hash += hash >> 6;
 249
 250     return hash;
 251 }
 252
 253
 254 uint ffHash (const(char)[] str) {
 255   return FFhashOf(str.ptr, str.length);
 256 }
 257
 258
 259 /**
 260  * 64-bit implementation of fasthash
 261  *
 262  * Params:
 263  *   buf =  data buffer
 264  *   seed = the seed
 265  *
 266  * Returns:
 267  *   32-bit or 64-bit hash
 268  */
 269 usize FFhashOf (const(void)* buf, usize len, usize seed=0) @trusted pure nothrow @nogc {
 270   enum Get8Bytes = q{
 271     cast(ulong)data[0]|
 272     (cast(ulong)data[1]<<8)|
 273     (cast(ulong)data[2]<<16)|
 274     (cast(ulong)data[3]<<24)|
 275     (cast(ulong)data[4]<<32)|
 276     (cast(ulong)data[5]<<40)|
 277     (cast(ulong)data[6]<<48)|
 278     (cast(ulong)data[7]<<56)
 279   };
 280   enum m = 0x880355f21e6d1965UL;
 281   auto data = cast(const(ubyte)*)buf;
 282   ulong h = seed;
 283   ulong t;
 284   foreach (immutable _; 0..len/8) {
 285     version(HasUnalignedOps) {
 286       if (__ctfe) {
 287         t = mixin(Get8Bytes);
 288       } else {
 289         t = *cast(ulong*)data;
 290       }
 291     } else {
 292       t = mixin(Get8Bytes);
 293     }
 294     data += 8;
 295     t ^= t>>23;
 296     t *= 0x2127599bf4325c37UL;
 297     t ^= t>>47;
 298     h ^= t;
 299     h *= m;
 300   }
 301
 302   h ^= len*m;
 303   t = 0;
 304   switch (len&7) {
 305     case 7: t ^= cast(ulong)data[6]<<48; goto case 6;
 306     case 6: t ^= cast(ulong)data[5]<<40; goto case 5;
 307     case 5: t ^= cast(ulong)data[4]<<32; goto case 4;
 308     case 4: t ^= cast(ulong)data[3]<<24; goto case 3;
 309     case 3: t ^= cast(ulong)data[2]<<16; goto case 2;
 310     case 2: t ^= cast(ulong)data[1]<<8; goto case 1;
 311     case 1: t ^= cast(ulong)data[0]; goto default;
 312     default:
 313       t ^= t>>23;
 314       t *= 0x2127599bf4325c37UL;
 315       t ^= t>>47;
 316       h ^= t;
 317       h *= m;
 318       break;
 319   }
 320
 321   h ^= h>>23;
 322   h *= 0x2127599bf4325c37UL;
 323   h ^= h>>47;
 324   static if (usize.sizeof == 4) {
 325     // 32-bit hash
 326     // the following trick converts the 64-bit hashcode to Fermat
 327     // residue, which shall retain information from both the higher
 328     // and lower parts of hashcode.
 329     return cast(usize)(h-(h>>32));
 330   } else {
 331     return h;
 332   }
 333 }
 334
 335
 336 uint jenkins_one_at_a_time_hash (const(void)* key, usize len) {
 337   uint h;
 338   auto data = cast(const(ubyte)*)key;
 339   foreach (immutable _; 0..len) {
 340     h += *data++;
 341     h += (h << 10);
 342     h ^= (h >> 6);
 343   }
 344   /*
 345   while (len != 0) {
 346     h += len&0xff;
 347     h += (h<<10);
 348     h ^= (h>>6);
 349     len >>= 8;
 350   }
 351   */
 352   h += (h << 3);
 353   h ^= (h >> 11);
 354   h += (h << 15);
 355   return h;
 356 }
 357
 358 uint joaatHash (const(char)[] str) {
 359   return jenkins_one_at_a_time_hash(str.ptr, str.length);
 360 }
 361
 362 void main () {
 363   loadWords();
 364   testHash("fastHash64", &fastHash64!char);
 365   testHash("fastHash32", &fastHash32!char);
 366   testHash("siphash24", (const(char)[] buf) => sipHash24Of("0123456789abcdef", buf));
 367   testHash("murhash", &murHash32!char);
 368   testHash("innerhash", &innerhash);
 369   testHash("outerhash", &outerhash);
 370   testHash("djb2", &djb2);
 371   testHash("djb2x", &djb2x);
 372   testHash("sdbm", &sdbm);
 373   testHash("sfHash", &sfHash);
 374   testHash("ffHash", &ffHash);
 375   testHash("joaatHash0", &joaatHash);
 376   testHash("joaatHash1", &joaatHash32!char);
 377   //testHash("loselose", &loselose);
 378 }