hphp/util/hash.h

   1 /*
   2    +----------------------------------------------------------------------+
   3    | HipHop for PHP                                                       |
   4    +----------------------------------------------------------------------+
   5    | Copyright (c) 2010-2014 Facebook, Inc. (http://www.facebook.com)     |
   6    +----------------------------------------------------------------------+
   7    | This source file is subject to version 3.01 of the PHP license,      |
   8    | that is bundled with this package in the file LICENSE, and is        |
   9    | available through the world-wide-web at the following url:           |
  10    | http://www.php.net/license/3_01.txt                                  |
  11    | If you did not receive a copy of the PHP license and are unable to   |
  12    | obtain it through the world-wide-web, please send a note to          |
  13    | license@php.net so we can mail you a copy immediately.               |
  14    +----------------------------------------------------------------------+
  15 */
  16
  17 #ifndef incl_HPHP_HASH_H_
  18 #define incl_HPHP_HASH_H_
  19
  20 #include <stdint.h>
  21 #include <cstring>
  22
  23 #include "hphp/util/portability.h"
  24
  25 namespace HPHP {
  26 ///////////////////////////////////////////////////////////////////////////////
  27
  28 typedef int32_t strhash_t;
  29 const strhash_t STRHASH_MASK = 0x7fffffff;
  30 const strhash_t STRHASH_MSB  = 0x80000000;
  31
  32 /*
  33  * "64 bit Mix Functions", from Thomas Wang's "Integer Hash Function."
  34  * http://www.concentric.net/~ttwang/tech/inthash.htm
  35  */
  36 inline long long hash_int64(long long key) {
  37   key = (~key) + (key << 21); // key = (key << 21) - key - 1;
  38   key = key ^ ((unsigned long long)key >> 24);
  39   key = (key + (key << 3)) + (key << 8); // key * 265
  40   key = key ^ ((unsigned long long)key >> 14);
  41   key = (key + (key << 2)) + (key << 4); // key * 21
  42   key = key ^ ((unsigned long long)key >> 28);
  43   key = key + (key << 31);
  44   return key < 0 ? -key : key;
  45 }
  46
  47 inline long long hash_int64_pair(long long k1, long long k2) {
  48   // Shift the first key, so (a,b) hashes somewhere other than (b,a)
  49   return (hash_int64(k1) << 1) ^ hash_int64(k2);
  50 }
  51
  52 namespace MurmurHash3 {
  53 ///////////////////////////////////////////////////////////////////////////////
  54 // The following code is based on MurmurHash3:
  55 //   http://code.google.com/p/smhasher/wiki/MurmurHash3
  56 //
  57 // The case-insensitive version converts lowercase characters to uppercase
  58 // under the assumption that character data are 7-bit ASCII. This should work
  59 // as identifiers usually only contain alphanumeric characters and the
  60 // underscore. Although PHP allows higher ASCII characters (> 127) in an
  61 // identifier, they should be very rare, and do not change the correctness.
  62
  63 // It is tempting to make useHash128 depend on whether the architecture is 32-
  64 // or 64-bit, but changing which hash function is used also requires
  65 // regenerating system files, so this setting is hardcoded here.
  66 const bool useHash128 = true;
  67
  68 #define ROTL32(x,y) rotl32(x,y)
  69 #define ROTL64(x,y) rotl64(x,y)
  70 #define BIG_CONSTANT(x) (x##LLU)
  71
  72 ALWAYS_INLINE uint64_t rotl64(uint64_t x, int8_t r) {
  73   return (x << r) | (x >> (64 - r));
  74 }
  75
  76 ALWAYS_INLINE uint32_t rotl32(uint32_t x, int8_t r) {
  77   return (x << r) | (x >> (32 - r));
  78 }
  79
  80 template <bool caseSensitive>
  81 ALWAYS_INLINE uint64_t getblock64(const uint64_t *p, int i) {
  82   uint64_t block = p[i];
  83   if (!caseSensitive) {
  84     block &= 0xdfdfdfdfdfdfdfdfLLU; // a-z => A-Z
  85   }
  86   return block;
  87 }
  88
  89 template <bool caseSensitive>
  90 ALWAYS_INLINE uint32_t getblock32(const uint32_t *p, int i) {
  91   uint32_t block = p[i];
  92   if (!caseSensitive) {
  93     block &= 0xdfdfdfdfU; // a-z => A-Z
  94   }
  95   return block;
  96 }
  97
  98 template <bool caseSensitive>
  99 ALWAYS_INLINE uint8_t getblock8(const uint8_t *p, int i) {
 100   uint8_t block = p[i];
 101   if (!caseSensitive) {
 102     block &= 0xdfU; // a-z => A-Z
 103   }
 104   return block;
 105 }
 106
 107 //-----------------------------------------------------------------------------
 108 // Finalization mix - force all bits of a hash block to avalanche
 109 ALWAYS_INLINE uint64_t fmix64(uint64_t k) {
 110   k ^= k >> 33;
 111   k *= BIG_CONSTANT(0xff51afd7ed558ccd);
 112   k ^= k >> 33;
 113   k *= BIG_CONSTANT(0xc4ceb9fe1a85ec53);
 114   k ^= k >> 33;
 115   return k;
 116 }
 117
 118 ALWAYS_INLINE uint32_t fmix32(uint32_t h) {
 119   h ^= h >> 16;
 120   h *= 0x85ebca6b;
 121   h ^= h >> 13;
 122   h *= 0xc2b2ae35;
 123   h ^= h >> 16;
 124   return h;
 125 }
 126
 127 template <bool caseSensitive>
 128 ALWAYS_INLINE uint32_t hash32(const void *key, size_t len, uint32_t seed) {
 129   const uint8_t *data = (const uint8_t *)key;
 130   const size_t nblocks = len / 4;
 131   uint32_t h1 = seed;
 132   const uint32_t c1 = 0xcc9e2d51;
 133   const uint32_t c2 = 0x1b873593;
 134
 135   //----------
 136   // body
 137   const uint32_t *blocks = (const uint32_t *)(data + nblocks*4);
 138   for(size_t i = -nblocks; i; i++) {
 139     uint32_t k1 = getblock32<caseSensitive>(blocks, i);
 140     k1 *= c1;
 141     k1 = ROTL32(k1,15);
 142     k1 *= c2;
 143     h1 ^= k1;
 144     h1 = ROTL32(h1,13);
 145     h1 = h1*5+0xe6546b64;
 146   }
 147
 148   //----------
 149   // tail
 150   const uint8_t *tail = (const uint8_t*)(data + nblocks*4);
 151   uint32_t k1 = 0;
 152   switch(len & 3) {
 153   case 3: k1 ^= getblock8<caseSensitive>(tail, 2) << 16;
 154   case 2: k1 ^= getblock8<caseSensitive>(tail, 1) << 8;
 155   case 1: k1 ^= getblock8<caseSensitive>(tail, 0);
 156           k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1;
 157   };
 158
 159   //----------
 160   // finalization
 161   h1 ^= len;
 162   h1 = fmix32(h1);
 163
 164   return h1;
 165 }
 166
 167 // Optimized for 64-bit architectures.  MurmurHash3 also implements a 128-bit
 168 // hash that is optimized for 32-bit architectures (omitted here).
 169 template <bool caseSensitive>
 170 ALWAYS_INLINE void hash128(const void *key, size_t len, uint64_t seed,
 171                            uint64_t out[2]) {
 172   const uint8_t *data = (const uint8_t *)key;
 173   const size_t nblocks = len / 16;
 174   uint64_t h1 = seed;
 175   uint64_t h2 = seed;
 176   const uint64_t c1 = BIG_CONSTANT(0x87c37b91114253d5);
 177   const uint64_t c2 = BIG_CONSTANT(0x4cf5ad432745937f);
 178
 179   //----------
 180   // body
 181   const uint64_t *blocks = (const uint64_t *)(data);
 182   for(size_t i = 0; i < nblocks; i++)
 183   {
 184     uint64_t k1 = getblock64<caseSensitive>(blocks,i*2+0);
 185     uint64_t k2 = getblock64<caseSensitive>(blocks,i*2+1);
 186     k1 *= c1; k1  = ROTL64(k1,31); k1 *= c2; h1 ^= k1;
 187     h1 = ROTL64(h1,27); h1 += h2; h1 = h1*5+0x52dce729;
 188     k2 *= c2; k2  = ROTL64(k2,33); k2 *= c1; h2 ^= k2;
 189     h2 = ROTL64(h2,31); h2 += h1; h2 = h2*5+0x38495ab5;
 190   }
 191
 192   //----------
 193   // tail
 194   const uint8_t *tail = (const uint8_t*)(data + nblocks*16);
 195   uint64_t k1 = 0;
 196   uint64_t k2 = 0;
 197   switch(len & 15)
 198   {
 199   case 15: k2 ^= uint64_t(getblock8<caseSensitive>(tail, 14)) << 48;
 200   case 14: k2 ^= uint64_t(getblock8<caseSensitive>(tail, 13)) << 40;
 201   case 13: k2 ^= uint64_t(getblock8<caseSensitive>(tail, 12)) << 32;
 202   case 12: k2 ^= uint64_t(getblock8<caseSensitive>(tail, 11)) << 24;
 203   case 11: k2 ^= uint64_t(getblock8<caseSensitive>(tail, 10)) << 16;
 204   case 10: k2 ^= uint64_t(getblock8<caseSensitive>(tail,  9)) << 8;
 205   case  9: k2 ^= uint64_t(getblock8<caseSensitive>(tail,  8)) << 0;
 206            k2 *= c2; k2  = ROTL64(k2,33); k2 *= c1; h2 ^= k2;
 207
 208   case  8: k1 ^= uint64_t(getblock8<caseSensitive>(tail,  7)) << 56;
 209   case  7: k1 ^= uint64_t(getblock8<caseSensitive>(tail,  6)) << 48;
 210   case  6: k1 ^= uint64_t(getblock8<caseSensitive>(tail,  5)) << 40;
 211   case  5: k1 ^= uint64_t(getblock8<caseSensitive>(tail,  4)) << 32;
 212   case  4: k1 ^= uint64_t(getblock8<caseSensitive>(tail,  3)) << 24;
 213   case  3: k1 ^= uint64_t(getblock8<caseSensitive>(tail,  2)) << 16;
 214   case  2: k1 ^= uint64_t(getblock8<caseSensitive>(tail,  1)) << 8;
 215   case  1: k1 ^= uint64_t(getblock8<caseSensitive>(tail,  0)) << 0;
 216            k1 *= c1; k1  = ROTL64(k1,31); k1 *= c2; h1 ^= k1;
 217   };
 218
 219   //----------
 220   // finalization
 221   h1 ^= len; h2 ^= len;
 222   h1 += h2;
 223   h2 += h1;
 224   h1 = fmix64(h1);
 225   h2 = fmix64(h2);
 226   h1 += h2;
 227   h2 += h1;
 228
 229   ((uint64_t*)out)[0] = h1;
 230   ((uint64_t*)out)[1] = h2;
 231 }
 232
 233 #undef ROTL32
 234 #undef ROTL64
 235 #undef BIG_CONSTANT
 236 ///////////////////////////////////////////////////////////////////////////////
 237 } // namespace MurmurHash3
 238
 239 inline strhash_t hash_string_cs(const char *arKey, int nKeyLength) {
 240   if (MurmurHash3::useHash128) {
 241     uint64_t h[2];
 242     MurmurHash3::hash128<true>(arKey, nKeyLength, 0, h);
 243     return strhash_t(h[0] & STRHASH_MASK);
 244   } else {
 245     uint32_t h = MurmurHash3::hash32<true>(arKey, nKeyLength, 0);
 246     return strhash_t(h & STRHASH_MASK);
 247   }
 248 }
 249
 250 strhash_t hash_string_i(const char *arKey, int nKeyLength);
 251 strhash_t hash_string(const char *arKey, int nKeyLength);
 252
 253 inline strhash_t hash_string_i_inline(const char *arKey, int nKeyLength) {
 254   if (MurmurHash3::useHash128) {
 255     uint64_t h[2];
 256     MurmurHash3::hash128<false>(arKey, nKeyLength, 0, h);
 257     return strhash_t(h[0] & STRHASH_MASK);
 258   } else {
 259     uint32_t h = MurmurHash3::hash32<false>(arKey, nKeyLength, 0);
 260     return strhash_t(h & STRHASH_MASK);
 261   }
 262 }
 263
 264 inline strhash_t hash_string_inline(const char *arKey, int nKeyLength) {
 265   return hash_string_i(arKey, nKeyLength);
 266 }
 267
 268 /**
 269  * We probably should get rid of this, so to detect code generation errors,
 270  * where a binary string is treated as a NULL-terminated literal. Do we ever
 271  * allow binary strings as array keys or symbol names?
 272  */
 273 inline strhash_t hash_string(const char *arKey) {
 274   return hash_string(arKey, strlen(arKey));
 275 }
 276 inline strhash_t hash_string_i(const char *arKey) {
 277   return hash_string_i(arKey, strlen(arKey));
 278 }
 279
 280 // This function returns true and sets the res parameter if arKey
 281 // is a non-empty string that matches one of the following conditions:
 282 //   1) The string is "0".
 283 //   2) The string starts with a non-zero digit, followed by at most
 284 //      18 more digits, and is less than or equal to 2^63 - 1.
 285 //   3) The string starts with a negative sign, followed by a non-zero
 286 //      digit, followed by at most 18 more digits, and is greater than
 287 //      or equal to -2^63.
 288 inline bool is_strictly_integer(const char* arKey, size_t nKeyLength,
 289                                 int64_t& res) {
 290   if ((unsigned char)(arKey[0] - '-') > ('9' - '-'))
 291     return false;
 292   if (nKeyLength <= 19 ||
 293       (arKey[0] == '-' && nKeyLength == 20)) {
 294     unsigned long long num = 0;
 295     bool neg = false;
 296     unsigned i = 0;
 297     if (arKey[0] == '-') {
 298       neg = true;
 299       i = 1;
 300       // The string "-" is NOT strictly an integer
 301       if (nKeyLength == 1)
 302         return false;
 303       // A string that starts with "-0" is NOT strictly an integer
 304       if (arKey[1] == '0')
 305         return false;
 306     } else if (arKey[0] == '0') {
 307       // The string "0" is strictly an integer
 308       if (nKeyLength == 1) {
 309         res = 0;
 310         return true;
 311       }
 312       // A string that starts with "0" followed by at least one digit
 313       // is NOT strictly an integer
 314       return false;
 315     }
 316     bool good = true;
 317     for (; i < nKeyLength; ++i) {
 318       if (arKey[i] >= '0' && arKey[i] <= '9') {
 319         num = 10*num + arKey[i] - '0';
 320       }
 321       else {
 322         good = false;
 323         break;
 324       }
 325     }
 326     if (good) {
 327       if (num <= 0x7FFFFFFFFFFFFFFFULL ||
 328           (neg && num == 0x8000000000000000ULL)) {
 329         res = neg ? 0 - num : (long long)num;
 330         return true;
 331       }
 332     }
 333   }
 334   return false;
 335 }
 336
 337 ///////////////////////////////////////////////////////////////////////////////
 338 }
 339
 340 #endif // incl_HPHP_HASH_H_