avoid copying an array when unsetting a nonexistent key
[hiphop-php.git] hphp/runtime/ext/collections/hash-collection.h
#ifndef incl_HPHP_COLLECTIONS_HASHCOLLECTION_H
#define incl_HPHP_COLLECTIONS_HASHCOLLECTION_H

#include "hphp/runtime/ext/extension.h"
#include "hphp/runtime/ext/collections/ext_collections.h"
#include "hphp/runtime/base/mixed-array.h"
#include "hphp/runtime/base/mixed-array-defs.h"
#include "hphp/runtime/base/tv-refcount.h"

namespace HPHP {
/////////////////////////////////////////////////////////////////////////////
ALWAYS_INLINE MixedArray* CreateDictAsMixed() {
  return MixedArray::asMixed(ArrayData::CreateDict());
}

// Common base class for BaseMap/BaseSet collections
struct HashCollection : ObjectData {
  explicit HashCollection(Class* cls, HeaderKind kind)
    : ObjectData(cls, NoInit{}, ObjectData::NoAttrs, kind)
    , m_unusedAndSize(0)
    , m_arr(CreateDictAsMixed())
  {}

  explicit HashCollection(Class* cls, HeaderKind kind, ArrayData* arr)
    : ObjectData(cls, NoInit{}, ObjectData::NoAttrs, kind)
    , m_unusedAndSize(arr->m_size)
    , m_arr(MixedArray::asMixed(arr))
  {}

  explicit HashCollection(Class* cls, HeaderKind kind, uint32_t cap);

  using Elm = MixedArray::Elm;
  using hash_t = MixedArray::hash_t;

  static const int32_t Empty = MixedArray::Empty;
  static const int32_t Tombstone = MixedArray::Tombstone;
  static const uint32_t LoadScale = MixedArray::LoadScale;
  static const uint32_t SmallScale = MixedArray::SmallScale;
  static const uint32_t SmallSize = MixedArray::SmallSize;
  static const uint32_t MaxSize = MixedArray::MaxSize;
  // A HashCollection can only guarantee that it won't throw "cannot add
  // element" exceptions if m_size <= MaxSize / 2. Therefore, we only allow
  // reserve() to make room for up to MaxSize / 2 elements.
  static const uint32_t MaxReserveSize = MaxSize / 2;

  int64_t size() const {
    return m_size;
  }

  Array toArray() = delete;

  template <IntishCast intishCast = IntishCast::None>
  Array toPHPArrayImpl() {
    if (!m_size) {
      return empty_array();
    }

    ArrayData* ad;
    if (intishCast == IntishCast::None) {
      ad = arrayData()->toPHPArray(true);
    } else if (intishCast == IntishCast::Cast) {
      ad = arrayData()->toPHPArrayIntishCast(true);
    } else {
      always_assert(false);
    }

    if (UNLIKELY(ad->size() < m_size)) warnOnStrIntDup();
    assertx(m_size);
    assertx(ad->m_pos == 0);
    return Array::attach(ad);
  }

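  // If the conversion above produced fewer elements than m_size, two keys
  // must have collided after conversion: an int and a string with the same
  // numeric value (e.g. 123 and '123'). warnOnStrIntDup() reports exactly
  // this case.
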
  Array toVArray();
  Array toDArray();

  Array toKeysArray();
  Array toValuesArray();

  bool contains(int64_t key) const {
    return find(key, hash_int64(key)) != Empty;
  }
  bool contains(StringData* key) const {
    return find(key, key->hash()) != Empty;
  }

  void remove(int64_t key);
  void remove(StringData* key);
  void eraseNoCompact(MixedArray::RemovePos pos);
  void erase(MixedArray::RemovePos pos) {
    eraseNoCompact(pos);
    compactOrShrinkIfDensityTooLow();
  }

  bool isFull() { return posLimit() == cap(); }
  bool isDensityTooLow() const {
    bool b = (m_size < posLimit() / 2);
    assertx(IMPLIES(
      arrayData()->isStatic() && arrayData()->empty(),
      !b
    ));
    assertx(IMPLIES(cap() == 0, !b));
    return b;
  }

  bool isCapacityTooHigh() const {
    // Return true if the current capacity is at least 8x greater than m_size
    // AND at least 8x greater than the minimum capacity.
    bool b = ((uint64_t(cap()) >= uint64_t(m_size) * 8) &&
              (cap() >= HashCollection::SmallSize * 8));
    assertx(IMPLIES(
      arrayData()->isStatic() && arrayData()->empty(),
      !b
    ));
    assertx(IMPLIES(cap() == 0, !b));
    return b;
  }

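  // Illustrative arithmetic for isCapacityTooHigh() (values hypothetical):
  // with m_size == 7 and cap() == 64, the first condition holds because
  // 64 >= 7 * 8 == 56; the result is then true provided 64 also meets the
  // SmallSize * 8 floor.
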
  // grow() will increase the capacity of this HashCollection; newScale must
  // be greater than or equal to the current scale so that the new cap and mask
  // satisfy all the usual cap/mask invariants.
  void grow(uint32_t newScale);

  // resizeHelper() dups all of the elements (not copying tombstones) to a
  // new buffer of the specified capacity and decRefs the old buffer. This
  // method can be used to decrease this HashCollection's capacity.
  void resizeHelper(uint32_t newCap);

  // This method will increase capacity or compact as needed to make
  // room to add one new element; it asserts that it is only called
  // when isFull() is true.
  void makeRoom();

  // This method performs an in-place compaction; it asserts that it
  // is only called when isDensityTooLow() is true.
  void compact();

  // This method reduces this HashCollection's capacity; it asserts that it
  // is only called when isCapacityTooHigh() is true.
  void shrink(uint32_t cap = 0);

  // In general this method should be called after one or more elements
  // have been removed. If density is too low, it will shrink or compact
  // this HashCollection as appropriate.
  void compactOrShrinkIfDensityTooLow() {
    if (UNLIKELY(isDensityTooLow())) {
      if (isCapacityTooHigh()) {
        shrink();
      } else {
        compact();
      }
    }
  }

  // In general this method should be called after a speculative reserve
  // and zero or more adds have been performed. If capacity is too high,
  // it will shrink this HashCollection.
  void shrinkIfCapacityTooHigh(uint32_t oldCap) {
    if (UNLIKELY(isCapacityTooHigh() && cap() > oldCap)) {
      shrink(oldCap);
    }
  }

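  // Illustrative speculative-reserve pattern for shrinkIfCapacityTooHigh()
  // (hypothetical caller; `coll` and `expected` are placeholders):
  //
  //   auto oldCap = coll->cap();
  //   coll->reserve(expected);               // may over-allocate
  //   // ... add zero or more elements ...
  //   coll->shrinkIfCapacityTooHigh(oldCap); // give back unused room
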
  HashCollection::Elm& allocElm(MixedArray::Inserter ei) {
    assertx(canMutateBuffer());
    assertx(MixedArray::isValidIns(ei) && !MixedArray::isValidPos(*ei)
            && m_size <= posLimit() && posLimit() < cap());
    size_t i = posLimit();
    *ei = i;
    setPosLimit(i + 1);
    incSize();
    return data()[i];
  }

  HashCollection::Elm& allocElmFront(MixedArray::Inserter ei);

  // This method will grow or compact as needed in preparation for
  // repeatedly adding new elements until m_size >= sz.
  void reserve(int64_t sz);

  // The iter functions below facilitate iteration over HashCollections.
  // Iterators cannot store Elm pointers (because it's possible for m_data
  // to change without bumping m_version in some cases), so indices are
  // used instead.

  bool iter_valid(ssize_t pos) const {
    return pos < (ssize_t)posLimit();
  }

  bool iter_valid(ssize_t pos, ssize_t limit) const {
    assertx(limit == (ssize_t)posLimit());
    return pos < limit;
  }

  const Elm* iter_elm(ssize_t pos) const {
    assertx(iter_valid(pos));
    return &(data()[pos]);
  }

  ssize_t iter_begin() const {
    ssize_t limit = posLimit();
    ssize_t pos = 0;
    for (; pos != limit; ++pos) {
      auto* e = iter_elm(pos);
      if (!isTombstone(e)) break;
    }
    return pos;
  }

  ssize_t iter_next(ssize_t pos) const {
    ssize_t limit = posLimit();
    for (++pos; pos < limit; ++pos) {
      auto* e = iter_elm(pos);
      if (!isTombstone(e)) return pos;
    }
    return limit;
  }

  ssize_t iter_prev(ssize_t pos) const {
    ssize_t orig_pos = pos;
    while (pos > 0) {
      --pos;
      auto* e = iter_elm(pos);
      if (!isTombstone(e)) return pos;
    }
    return orig_pos;
  }

  Variant iter_key(ssize_t pos) const {
    assertx(iter_valid(pos));
    auto* e = iter_elm(pos);
    if (e->hasStrKey()) {
      return Variant{e->skey};
    }
    return (int64_t)e->ikey;
  }

  const TypedValue* iter_value(ssize_t pos) const {
    assertx(iter_valid(pos));
    return &iter_elm(pos)->data;
  }

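  // Typical caller-side traversal using the iter functions above
  // (illustrative sketch; `coll` is a hypothetical HashCollection*):
  //
  //   for (ssize_t pos = coll->iter_begin(); coll->iter_valid(pos);
  //        pos = coll->iter_next(pos)) {
  //     Variant k = coll->iter_key(pos);
  //     const TypedValue* v = coll->iter_value(pos);
  //     // ... use k and v ...
  //   }
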
  uint32_t nthElmPos(size_t n) const {
    if (LIKELY(!hasTombstones())) {
      // Fast path: HashCollection contains no tombstones
      return n;
    }
    // Slow path: this HashCollection has at least one tombstone,
    // so we need to count forward.
    // TODO Task# 4281431: If n > m_size/2 we could get better
    // performance by starting at the end of the buffer and
    // walking backward.
    if (n >= m_size) {
      return posLimit();
    }
    uint32_t pos = 0;
    for (;;) {
      while (isTombstone(pos)) {
        assertx(pos + 1 < posLimit());
        ++pos;
      }
      if (n <= 0) break;
      --n;
      assertx(pos + 1 < posLimit());
      ++pos;
    }
    return pos;
  }

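  // Worked example for nthElmPos(): with the buffer laid out as
  // [tombstone, e0, tombstone, e1], nthElmPos(0) == 1 and nthElmPos(1) == 3.
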
  /**
   * mutate() must be called before doing anything that mutates
   * this HashCollection's buffer, unless it can be proven that
   * canMutateBuffer() is true. mutate() takes care of updating
   * m_immCopy and making a copy of this HashCollection's buffer
   * if needed.
   */
  void mutate() {
    assertx(IMPLIES(!m_immCopy.isNull(), arrayData()->hasMultipleRefs()));
    if (arrayData()->cowCheck()) {
      // mutateImpl() does two things for us. First it drops the
      // immutable collection held by m_immCopy (if m_immCopy is not
      // null). Second, it takes care of copying the buffer if needed.
      mutateImpl();
    }
    assertx(canMutateBuffer());
    assertx(m_immCopy.isNull());
  }

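  // Copy-on-write discipline in a nutshell (illustrative):
  //
  //   coll->mutate();                    // unshare the buffer if needed
  //   assertx(coll->canMutateBuffer());
  //   // ... direct writes through coll->data() are now safe ...
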
  void dropImmCopy() {
    assertx(m_immCopy.isNull() ||
            (arrayData() == ((HashCollection*)m_immCopy.get())->arrayData() &&
             arrayData()->hasMultipleRefs()));
    m_immCopy.reset();
  }

  TypedValue* findForUnserialize(int64_t k) {
    auto h = hash_int64(k);
    auto p = findForInsert(k, h);
    if (UNLIKELY(MixedArray::isValidPos(*p))) return nullptr;
    auto e = &allocElm(p);
    e->setIntKey(k, h);
    arrayData()->mutableKeyTypes()->recordInt();
    updateNextKI(k);
    return &e->data;
  }

  TypedValue* findForUnserialize(StringData* key) {
    auto h = key->hash();
    auto p = findForInsert(key, h);
    if (UNLIKELY(MixedArray::isValidPos(*p))) return nullptr;
    auto e = &allocElm(p);
    e->setStrKey(key, h);
    arrayData()->mutableKeyTypes()->recordStr(key);
    return &e->data;
  }

  /**
   * Batch insertion interface that postpones hashing, for cache efficiency.
   * Use these methods in order:
   * 1. batchInsertBegin,
   * 2. any number of batchInsert calls,
   * 3. tryBatchInsertEnd or batchInsertAbort
   * No other methods may be called between 1 and 3.
   */
  uint32_t batchInsertBegin(int64_t n) {
    reserve(m_size + n);
    return posLimit();
  }
  TypedValue* batchInsert(StringData* key) {
    auto h = key->hash();
    // Not hashing yet, so position is unused for now.
    int32_t unusedPos = -1;
    auto e = &allocElm(MixedArray::Inserter(&unusedPos));
    e->setStrKey(key, h);
    arrayData()->mutableKeyTypes()->recordStr(key);
    return &e->data;
  }
  /**
   * Attempts to finalize a batch insertion, given the return value from the
   * call to batchInsertBegin. Returns true on success, and aborts and returns
   * false on failure (e.g., duplicate keys).
   */
  bool tryBatchInsertEnd(uint32_t begin) {
    for (auto i = begin; i < posLimit(); ++i) {
      auto e = data()[i];
      auto h = e.hash();
      auto p = e.hasStrKey() ?
        findForInsert(e.skey, h) :
        findForInsert(e.ikey, h);
      if (UNLIKELY(MixedArray::isValidPos(*p))) {
        batchInsertAbort(begin);
        return false;
      }
      *p = i;
    }
    return true;
  }
  /**
   * Cancels a started batch insertion, given the return value from the call to
   * batchInsertBegin, reverting to the state before it began. Idempotent.
   */
  void batchInsertAbort(uint32_t begin) {
    for (auto i = posLimit(); i > begin; --i) {
      auto& e = data()[i - 1];
      auto tv = &e.data;
      auto const old = *tv;
      tv->m_type = kInvalidDataType;
      decSize();
      setPosLimit(i - 1);
      if (e.hasStrKey()) decRefStr(e.skey);
      tvDecRefGen(old);
    }
  }

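  // Illustrative use of the batch interface (hypothetical caller; `keys` and
  // `val` are placeholders):
  //
  //   auto begin = coll->batchInsertBegin(keys.size());
  //   for (StringData* key : keys) {
  //     *coll->batchInsert(key) = val;   // val is a TypedValue
  //   }
  //   if (!coll->tryBatchInsertEnd(begin)) {
  //     // duplicate keys: the batch was rolled back via batchInsertAbort()
  //   }
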
  static bool validPos(ssize_t pos) {
    return pos >= 0;
  }

  static bool validPos(int32_t pos) {
    return pos >= 0;
  }

  ALWAYS_INLINE
  static std::array<TypedValue, 1>
  makeArgsFromHashValue(const HashCollection::Elm& e) {
    // Note that this is a potentially unnecessary copy
    // that might be reinterpret_cast'ed away.
    // http://stackoverflow.com/questions/11205186/treat-c-cstyle-array-as-stdarray
    return std::array<TypedValue, 1>{{ e.data }};
  }

  ALWAYS_INLINE
  static std::array<TypedValue, 2>
  makeArgsFromHashKeyAndValue(const HashCollection::Elm& e) {
    return std::array<TypedValue, 2>{{
      (e.hasIntKey()
        ? make_tv<KindOfInt64>(e.ikey)
        : make_tv<KindOfString>(e.skey)),
      e.data
    }};
  }

  /**
   * canMutateBuffer() indicates whether it is currently safe to directly
   * modify this HashCollection's buffer.
   */
  bool canMutateBuffer() const {
    assertx(IMPLIES(!arrayData()->cowCheck(), m_immCopy.isNull()));
    return !arrayData()->cowCheck();
  }

  static constexpr ptrdiff_t arrOffset() {
    return offsetof(HashCollection, m_arr);
  }

  bool toBoolImpl() const {
    return (m_size != 0);
  }

  ssize_t find(int64_t ki, inthash_t h) const {
    return m_arr->find(ki, h);
  }

  ssize_t find(const StringData* s, strhash_t h) const {
    return m_arr->find(s, h);
  }

  auto findForRemove(int64_t k, inthash_t h) const {
    return m_arr->findForRemove(k, h);
  }

  auto findForRemove(const StringData* s, strhash_t h) const {
    return m_arr->findForRemove(s, h);
  }

  MixedArray::Inserter findForInsert(int64_t ki, inthash_t h) const {
    return m_arr->findForInsertUpdate(ki, h);
  }

  MixedArray::Inserter findForInsert(const StringData* s, strhash_t h) const {
    return m_arr->findForInsertUpdate(s, h);
  }

  MixedArray::Inserter findForNewInsert(int32_t* table, size_t mask,
                                        hash_t h0) const {
    return m_arr->findForNewInsert(table, mask, h0);
  }

  static void copyElm(const Elm& frE, Elm& toE) {
    memcpy(&toE, &frE, sizeof(Elm));
  }

  static void dupElm(const Elm& frE, Elm& toE) {
    assertx(!isTombstoneType(frE.data.m_type));
    memcpy(&toE, &frE, sizeof(Elm));
    if (toE.hasStrKey()) toE.skey->incRefCount();
    tvIncRefGen(toE.data);
  }

  MixedArray* arrayData() { return m_arr; }
  const MixedArray* arrayData() const { return m_arr; }

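  // copyElm() vs dupElm(): copyElm is a raw bitwise transfer that leaves
  // refcounts alone (appropriate when the source slot is being abandoned),
  // while dupElm also incRefs the string key and the value, which is what
  // resizeHelper() needs when duplicating elements into a fresh buffer.
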
  /**
   * Copy the buffer and reset the immutable copy.
   */
  void mutateImpl();

  Elm* data() { return m_arr->data(); }
  const Elm* data() const { return m_arr->data(); }
  int32_t* hashTab() const { return m_arr->hashTab(); }

  void setSize(uint32_t sz) {
    assertx(sz <= cap());
    if (m_arr->isStatic() && m_arr->empty()) {
      assertx(sz == 0);
      return;
    }
    assertx(!arrayData()->hasMultipleRefs());
    m_size = sz;
    arrayData()->m_size = sz;
  }
  void incSize() {
    assertx(m_size + 1 <= cap());
    assertx(!arrayData()->hasMultipleRefs());
    ++m_size;
    arrayData()->m_size = m_size;
  }
  void decSize() {
    assertx(m_size > 0);
    assertx(!arrayData()->hasMultipleRefs());
    --m_size;
    arrayData()->m_size = m_size;
  }
  uint32_t scale() const {
    return arrayData()->scale();
  }
  uint32_t cap() const {
    return arrayData()->capacity();
  }
  uint32_t tableMask() const {
    return arrayData()->mask();
  }
  uint32_t posLimit() const {
    return arrayData()->m_used;
  }
  void incPosLimit() {
    assertx(!arrayData()->hasMultipleRefs());
    assertx(posLimit() + 1 <= cap());
    arrayData()->m_used++;
  }
  void setPosLimit(uint32_t limit) {
    auto* a = arrayData();
    if (a->isStatic() && a->empty()) {
      assertx(limit == 0);
      return;
    }
    assertx(!a->hasMultipleRefs());
    assertx(limit <= cap());
    a->m_used = limit;
  }
  int64_t nextKI() {
    return arrayData()->m_nextKI;
  }
  void setNextKI(int64_t ki) {
    assertx(!arrayData()->hasMultipleRefs());
    arrayData()->m_nextKI = ki;
  }
  void updateNextKI(int64_t ki) {
    assertx(!arrayData()->hasMultipleRefs());
    auto* a = arrayData();
    if (ki >= a->m_nextKI && a->m_nextKI >= 0) {
      a->m_nextKI = static_cast<uint64_t>(ki) + 1;
    }
  }

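  // updateNextKI() keeps m_nextKI one past the largest int key inserted so
  // far (while m_nextKI is non-negative), matching the usual PHP-array rule
  // for assigning the next appended integer key.
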
  // The skipTombstonesNoBoundsCheck helper functions assume that either
  // the specified location is not a tombstone OR that there is at least
  // one non-tombstone after the specified position.

  static int32_t
  skipTombstonesNoBoundsCheck(int32_t pos, int32_t posLimit, const Elm* data) {
    assertx(pos < posLimit);
    while (isTombstone(pos, data)) {
      ++pos;
      assertx(pos < posLimit);
    }
    return pos;
  }

  int32_t skipTombstonesNoBoundsCheck(int32_t pos) {
    return skipTombstonesNoBoundsCheck(pos, posLimit(), data());
  }

  const Elm* firstElmImpl() const {
    const Elm* e = data();
    const Elm* eLimit = elmLimit();
    for (; e != eLimit && isTombstone(e); ++e) {}
    return (Elm*)e;
  }
  Elm* firstElm() {
    return (Elm*)firstElmImpl();
  }
  const Elm* firstElm() const {
    return firstElmImpl();
  }

  Elm* elmLimit() {
    return data() + posLimit();
  }
  const Elm* elmLimit() const {
    return data() + posLimit();
  }

  static Elm* nextElm(Elm* e, Elm* eLimit) {
    assertx(e != eLimit);
    for (++e; e != eLimit && isTombstone(e); ++e) {}
    return e;
  }
  static const Elm* nextElm(const Elm* e, const Elm* eLimit) {
    return (const Elm*)nextElm((Elm*)e, (Elm*)eLimit);
  }

  static bool isTombstoneType(DataType t) {
    return t == kInvalidDataType;
  }

  static bool isTombstone(const Elm* e) {
    return isTombstoneType(e->data.m_type);
  }

  static bool isTombstone(ssize_t pos, const Elm* data) {
    return isTombstoneType(data[pos].data.m_type);
  }

  bool isTombstone(ssize_t pos) const {
    assertx(size_t(pos) <= posLimit());
    return isTombstone(pos, data());
  }

  bool hasTombstones() const { return m_size != posLimit(); }

  static uint32_t computeMaxElms(uint32_t tableMask) {
    return tableMask - tableMask / LoadScale;
  }

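  // Tombstone recap: removal marks a slot's DataType as kInvalidDataType (the
  // tombstone sentinel) instead of sliding later elements down, so m_size
  // lags behind posLimit() until compact() or shrink() runs; hasTombstones()
  // detects exactly that gap.
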
  [[noreturn]] void throwTooLarge();
  [[noreturn]] void throwReserveTooLarge();
  int32_t* warnUnbalanced(size_t n, int32_t* ei) const;

  /**
   * Raises a warning if the set contains an int and a string with the same
   * numeric value: e.g. Set {'123', 123}. It's a no-op otherwise.
   */
  void warnOnStrIntDup() const;

  void scan(type_scan::Scanner& scanner) const {
    scanner.scan(m_arr);
    scanner.scan(m_immCopy);
  }

  struct SortTmp {
    SortTmp(HashCollection* h, SortFunction sf) : m_h(h) {
      if (hasUserDefinedCmp(sf)) {
        m_ad = MixedArray::Copy(m_h->m_arr);
      } else {
        m_h->mutate();
        m_ad = m_h->m_arr;
      }
    }
    ~SortTmp() {
      if (m_h->m_arr != m_ad) {
        Array tmp = Array::attach(m_h->m_arr);
        assertx(m_ad->isDictKind());
        m_h->m_arr = static_cast<MixedArray*>(m_ad);
      }
    }
    ArrayData* operator->() { return m_ad; }
   private:
    HashCollection* m_h;
    ArrayData* m_ad;
  };

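  // SortTmp is an RAII helper for the sort implementations: with a
  // user-defined comparator it sorts a copy of the backing array (the
  // callback could presumably observe or mutate the collection mid-sort),
  // and its destructor installs the sorted copy back into m_h; with a
  // builtin comparator it simply sorts the unshared array in place.
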
 protected:

  // Replace the m_arr field with a new MixedArray. The array must be known to
  // *not* contain any references.
  void replaceArray(ArrayData* adata) {
    auto* oldAd = m_arr;
    dropImmCopy();
    m_arr = MixedArray::asMixed(adata);
    adata->incRefCount();
    m_size = adata->size();
    decRefArr(oldAd);
  }

  union {
    struct {
      uint32_t m_size;
      int32_t m_unused;
    };
    int64_t m_unusedAndSize;
  };

  MixedArray* m_arr; // Elm store.

  // A pointer to an immutable collection that shares its buffer with
  // this collection.
  Object m_immCopy;
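
  // Note on the union above: on little-endian targets m_size aliases the low
  // 32 bits of m_unusedAndSize, which lets the constructors initialize both
  // the size and the unused word with a single 64-bit store.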
};

/////////////////////////////////////////////////////////////////////////////
}

#endif