avoid copying an array when unsetting a nonexistent key
[hiphop-php.git] hphp/runtime/ext/collections/hash-collection.h
#ifndef incl_HPHP_COLLECTIONS_HASHCOLLECTION_H
#define incl_HPHP_COLLECTIONS_HASHCOLLECTION_H

#include "hphp/runtime/ext/extension.h"
#include "hphp/runtime/ext/collections/ext_collections.h"
#include "hphp/runtime/base/mixed-array.h"
#include "hphp/runtime/base/mixed-array-defs.h"
#include "hphp/runtime/base/tv-refcount.h"

namespace HPHP {
/////////////////////////////////////////////////////////////////////////////
ALWAYS_INLINE MixedArray* CreateDictAsMixed() {
  return MixedArray::asMixed(ArrayData::CreateDict());
}

// Common base class for BaseMap/BaseSet collections
struct HashCollection : ObjectData {
  explicit HashCollection(Class* cls, HeaderKind kind)
    : ObjectData(cls, NoInit{}, ObjectData::NoAttrs, kind)
    , m_unusedAndSize(0)
    , m_arr(CreateDictAsMixed())
  {}

  explicit HashCollection(Class* cls, HeaderKind kind, ArrayData* arr)
    : ObjectData(cls, NoInit{}, ObjectData::NoAttrs, kind)
    , m_unusedAndSize(arr->m_size)
    , m_arr(MixedArray::asMixed(arr))
  {}

  explicit HashCollection(Class* cls, HeaderKind kind, uint32_t cap);

  using Elm = MixedArray::Elm;
  using hash_t = MixedArray::hash_t;

  static const int32_t Empty = MixedArray::Empty;
  static const int32_t Tombstone = MixedArray::Tombstone;
  static const uint32_t LoadScale = MixedArray::LoadScale;
  static const uint32_t SmallScale = MixedArray::SmallScale;
  static const uint32_t SmallSize = MixedArray::SmallSize;
  static const uint32_t MaxSize = MixedArray::MaxSize;
  // A HashCollection can only guarantee that it won't throw "cannot add
  // element" exceptions if m_size <= MaxSize / 2. Therefore, we only allow
  // reserve() to make room for up to MaxSize / 2 elements.
  static const uint32_t MaxReserveSize = MaxSize / 2;

  int64_t size() const {
    return m_size;
  }

  Array toArray() = delete;

  template <IntishCast intishCast = IntishCast::None>
  Array toPHPArrayImpl() {
    if (!m_size) {
      return empty_array();
    }

    ArrayData* ad;
    if (intishCast == IntishCast::None) {
      ad = arrayData()->toPHPArray(true);
    } else if (intishCast == IntishCast::Cast) {
      ad = arrayData()->toPHPArrayIntishCast(true);
    } else {
      always_assert(false);
    }

    if (UNLIKELY(ad->size() < m_size)) warnOnStrIntDup();
    assertx(m_size);
    assertx(ad->m_pos == 0);
    return Array::attach(ad);
  }

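  // If the conversion above produced fewer elements than m_size, two keys
  // must have collided after conversion: an int and a string with the same
  // numeric value (e.g. 123 and '123'). warnOnStrIntDup() reports exactly
  // this case.
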
  Array toVArray();
  Array toDArray();

  Array toKeysArray();
  Array toValuesArray();

  bool contains(int64_t key) const {
    return find(key, hash_int64(key)) != Empty;
  }
  bool contains(StringData* key) const {
    return find(key, key->hash()) != Empty;
  }

  void remove(int64_t key);
  void remove(StringData* key);
  void eraseNoCompact(MixedArray::RemovePos pos);
  void erase(MixedArray::RemovePos pos) {
    eraseNoCompact(pos);
    compactOrShrinkIfDensityTooLow();
  }

  bool isFull() { return posLimit() == cap(); }
  bool isDensityTooLow() const {
    bool b = (m_size < posLimit() / 2);
    assertx(IMPLIES(
      arrayData()->isStatic() && arrayData()->empty(),
      !b
    ));
    assertx(IMPLIES(cap() == 0, !b));
    return b;
  }

  bool isCapacityTooHigh() const {
    // Return true if the current capacity is at least 8x greater than m_size
    // AND at least 8x greater than the minimum capacity.
    bool b = ((uint64_t(cap()) >= uint64_t(m_size) * 8) &&
              (cap() >= HashCollection::SmallSize * 8));
    assertx(IMPLIES(
      arrayData()->isStatic() && arrayData()->empty(),
      !b
    ));
    assertx(IMPLIES(cap() == 0, !b));
    return b;
  }

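  // Illustrative arithmetic for isCapacityTooHigh() (values hypothetical):
  // with m_size == 7 and cap() == 64, the first condition holds because
  // 64 >= 7 * 8 == 56; the result is then true provided 64 also meets the
  // SmallSize * 8 floor.
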
  // grow() will increase the capacity of this HashCollection; newScale must
  // be greater than or equal to the current scale so that the new cap and mask
  // satisfy all the usual cap/mask invariants.
  void grow(uint32_t newScale);

  // resizeHelper() dups all of the elements (not copying tombstones) to a
  // new buffer of the specified capacity and decRefs the old buffer. This
  // method can be used to decrease this HashCollection's capacity.
  void resizeHelper(uint32_t newCap);

  // This method will increase capacity or compact as needed to make
  // room to add one new element; it asserts that it is only called
  // when isFull() is true.
  void makeRoom();

  // This method performs an in-place compaction; it asserts that it
  // is only called when isDensityTooLow() is true.
  void compact();

  // This method reduces this HashCollection's capacity; it asserts that it
  // is only called when isCapacityTooHigh() is true.
  void shrink(uint32_t cap = 0);

  // In general this method should be called after one or more elements
  // have been removed. If density is too low, it will shrink or compact
  // this HashCollection as appropriate.
  void compactOrShrinkIfDensityTooLow() {
    if (UNLIKELY(isDensityTooLow())) {
      if (isCapacityTooHigh()) {
        shrink();
      } else {
        compact();
      }
    }
  }

  // In general this method should be called after a speculative reserve
  // and zero or more adds have been performed. If capacity is too high,
  // it will shrink this HashCollection.
  void shrinkIfCapacityTooHigh(uint32_t oldCap) {
    if (UNLIKELY(isCapacityTooHigh() && cap() > oldCap)) {
      shrink(oldCap);
    }
  }

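  // Illustrative speculative-reserve pattern for shrinkIfCapacityTooHigh()
  // (hypothetical caller; `coll` and `expected` are placeholders):
  //
  //   auto oldCap = coll->cap();
  //   coll->reserve(expected);               // may over-allocate
  //   // ... add zero or more elements ...
  //   coll->shrinkIfCapacityTooHigh(oldCap); // give back unused room
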
  HashCollection::Elm& allocElm(MixedArray::Inserter ei) {
    assertx(canMutateBuffer());
    assertx(MixedArray::isValidIns(ei) && !MixedArray::isValidPos(*ei)
            && m_size <= posLimit() && posLimit() < cap());
    size_t i = posLimit();
    *ei = i;
    setPosLimit(i + 1);
    incSize();
    return data()[i];
  }

  HashCollection::Elm& allocElmFront(MixedArray::Inserter ei);

  // This method will grow or compact as needed in preparation for
  // repeatedly adding new elements until m_size >= sz.
  void reserve(int64_t sz);

  // The iter functions below facilitate iteration over HashCollections.
  // Iterators cannot store Elm pointers (because it's possible for m_data
  // to change without bumping m_version in some cases), so indices are
  // used instead.

  bool iter_valid(ssize_t pos) const {
    return pos < (ssize_t)posLimit();
  }

  bool iter_valid(ssize_t pos, ssize_t limit) const {
    assertx(limit == (ssize_t)posLimit());
    return pos < limit;
  }

  const Elm* iter_elm(ssize_t pos) const {
    assertx(iter_valid(pos));
    return &(data()[pos]);
  }

  ssize_t iter_begin() const {
    ssize_t limit = posLimit();
    ssize_t pos = 0;
    for (; pos != limit; ++pos) {
      auto* e = iter_elm(pos);
      if (!isTombstone(e)) break;
    }
    return pos;
  }

  ssize_t iter_next(ssize_t pos) const {
    ssize_t limit = posLimit();
    for (++pos; pos < limit; ++pos) {
      auto* e = iter_elm(pos);
      if (!isTombstone(e)) return pos;
    }
    return limit;
  }

  ssize_t iter_prev(ssize_t pos) const {
    ssize_t orig_pos = pos;
    while (pos > 0) {
      --pos;
      auto* e = iter_elm(pos);
      if (!isTombstone(e)) return pos;
    }
    return orig_pos;
  }

  Variant iter_key(ssize_t pos) const {
    assertx(iter_valid(pos));
    auto* e = iter_elm(pos);
    if (e->hasStrKey()) {
      return Variant{e->skey};
    }
    return (int64_t)e->ikey;
  }

  const TypedValue* iter_value(ssize_t pos) const {
    assertx(iter_valid(pos));
    return &iter_elm(pos)->data;
  }

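  // Typical caller-side traversal using the iter functions above
  // (illustrative sketch; `coll` is a hypothetical HashCollection*):
  //
  //   for (ssize_t pos = coll->iter_begin(); coll->iter_valid(pos);
  //        pos = coll->iter_next(pos)) {
  //     Variant k = coll->iter_key(pos);
  //     const TypedValue* v = coll->iter_value(pos);
  //     // ... use k and v ...
  //   }
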
  uint32_t nthElmPos(size_t n) const {
    if (LIKELY(!hasTombstones())) {
      // Fast path: HashCollection contains no tombstones
      return n;
    }
    // Slow path: this HashCollection has at least one tombstone,
    // so we need to count forward.
    // TODO Task# 4281431: If n > m_size/2 we could get better
    // performance by starting at the end of the buffer and
    // walking backward.
    if (n >= m_size) {
      return posLimit();
    }
    uint32_t pos = 0;
    for (;;) {
      while (isTombstone(pos)) {
        assertx(pos + 1 < posLimit());
        ++pos;
      }
      if (n <= 0) break;
      --n;
      assertx(pos + 1 < posLimit());
      ++pos;
    }
    return pos;
  }

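  // Worked example for nthElmPos(): with the buffer laid out as
  // [tombstone, e0, tombstone, e1], nthElmPos(0) == 1 and nthElmPos(1) == 3.
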
  /**
   * mutate() must be called before doing anything that mutates
   * this HashCollection's buffer, unless it can be proven that
   * canMutateBuffer() is true. mutate() takes care of updating
   * m_immCopy and making a copy of this HashCollection's buffer
   * if needed.
   */
  void mutate() {
    assertx(IMPLIES(!m_immCopy.isNull(), arrayData()->hasMultipleRefs()));
    if (arrayData()->cowCheck()) {
      // mutateImpl() does two things for us. First it drops the
      // immutable collection held by m_immCopy (if m_immCopy is not
      // null). Second, it takes care of copying the buffer if needed.
      mutateImpl();
    }
    assertx(canMutateBuffer());
    assertx(m_immCopy.isNull());
  }

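  // Copy-on-write discipline in a nutshell (illustrative):
  //
  //   coll->mutate();                    // unshare the buffer if needed
  //   assertx(coll->canMutateBuffer());
  //   // ... direct writes through coll->data() are now safe ...
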
  void dropImmCopy() {
    assertx(m_immCopy.isNull() ||
            (arrayData() == ((HashCollection*)m_immCopy.get())->arrayData() &&
             arrayData()->hasMultipleRefs()));
    m_immCopy.reset();
  }

  TypedValue* findForUnserialize(int64_t k) {
    auto h = hash_int64(k);
    auto p = findForInsert(k, h);
    if (UNLIKELY(MixedArray::isValidPos(*p))) return nullptr;
    auto e = &allocElm(p);
    e->setIntKey(k, h);
    arrayData()->mutableKeyTypes()->recordInt();
    updateNextKI(k);
    return &e->data;
  }

  TypedValue* findForUnserialize(StringData* key) {
    auto h = key->hash();
    auto p = findForInsert(key, h);
    if (UNLIKELY(MixedArray::isValidPos(*p))) return nullptr;
    auto e = &allocElm(p);
    e->setStrKey(key, h);
    arrayData()->mutableKeyTypes()->recordStr(key);
    return &e->data;
  }

  /**
   * Batch insertion interface that postpones hashing, for cache efficiency.
   * Use these methods in order:
   * 1. batchInsertBegin,
   * 2. any number of batchInsert calls,
   * 3. tryBatchInsertEnd or batchInsertAbort
   * No other methods may be called between 1 and 3.
   */
  uint32_t batchInsertBegin(int64_t n) {
    reserve(m_size + n);
    return posLimit();
  }
  TypedValue* batchInsert(StringData* key) {
    auto h = key->hash();
    // Not hashing yet, so position is unused for now.
    int32_t unusedPos = -1;
    auto e = &allocElm(MixedArray::Inserter(&unusedPos));
    e->setStrKey(key, h);
    arrayData()->mutableKeyTypes()->recordStr(key);
    return &e->data;
  }
  /**
   * Attempts to finalize a batch insertion, given the return value from the
   * call to batchInsertBegin. Returns true on success, and aborts and returns
   * false on failure (e.g., duplicate keys).
   */
  bool tryBatchInsertEnd(uint32_t begin) {
    for (auto i = begin; i < posLimit(); ++i) {
      auto e = data()[i];
      auto h = e.hash();
      auto p = e.hasStrKey() ?
        findForInsert(e.skey, h) :
        findForInsert(e.ikey, h);
      if (UNLIKELY(MixedArray::isValidPos(*p))) {
        batchInsertAbort(begin);
        return false;
      }
      *p = i;
    }
    return true;
  }
  /**
   * Cancels a started batch insertion, given the return value from the call to
   * batchInsertBegin, reverting to the state before it began. Idempotent.
   */
  void batchInsertAbort(uint32_t begin) {
    for (auto i = posLimit(); i > begin; --i) {
      auto& e = data()[i - 1];
      auto tv = &e.data;
      auto const old = *tv;
      tv->m_type = kInvalidDataType;
      decSize();
      setPosLimit(i - 1);
      if (e.hasStrKey()) decRefStr(e.skey);
      tvDecRefGen(old);
    }
  }

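  // Illustrative use of the batch interface (hypothetical caller; `keys` and
  // `val` are placeholders):
  //
  //   auto begin = coll->batchInsertBegin(keys.size());
  //   for (StringData* key : keys) {
  //     *coll->batchInsert(key) = val;   // val is a TypedValue
  //   }
  //   if (!coll->tryBatchInsertEnd(begin)) {
  //     // duplicate keys: the batch was rolled back via batchInsertAbort()
  //   }
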
  static bool validPos(ssize_t pos) {
    return pos >= 0;
  }

  static bool validPos(int32_t pos) {
    return pos >= 0;
  }

  ALWAYS_INLINE
  static std::array<TypedValue, 1>
  makeArgsFromHashValue(const HashCollection::Elm& e) {
    // Note that this is a potentially unnecessary copy
    // that might be reinterpret_cast'ed away.
    // http://stackoverflow.com/questions/11205186/treat-c-cstyle-array-as-stdarray
    return std::array<TypedValue, 1>{{ e.data }};
  }

  ALWAYS_INLINE
  static std::array<TypedValue, 2>
  makeArgsFromHashKeyAndValue(const HashCollection::Elm& e) {
    return std::array<TypedValue, 2>{{
      (e.hasIntKey()
        ? make_tv<KindOfInt64>(e.ikey)
        : make_tv<KindOfString>(e.skey)),
      e.data
    }};
  }

  /**
   * canMutateBuffer() indicates whether it is currently safe to directly
   * modify this HashCollection's buffer.
   */
  bool canMutateBuffer() const {
    assertx(IMPLIES(!arrayData()->cowCheck(), m_immCopy.isNull()));
    return !arrayData()->cowCheck();
  }

  static constexpr ptrdiff_t arrOffset() {
    return offsetof(HashCollection, m_arr);
  }

  bool toBoolImpl() const {
    return (m_size != 0);
  }

  ssize_t find(int64_t ki, inthash_t h) const {
    return m_arr->find(ki, h);
  }

  ssize_t find(const StringData* s, strhash_t h) const {
    return m_arr->find(s, h);
  }

  auto findForRemove(int64_t k, inthash_t h) const {
    return m_arr->findForRemove(k, h);
  }

  auto findForRemove(const StringData* s, strhash_t h) const {
    return m_arr->findForRemove(s, h);
  }

  MixedArray::Inserter findForInsert(int64_t ki, inthash_t h) const {
    return m_arr->findForInsertUpdate(ki, h);
  }

  MixedArray::Inserter findForInsert(const StringData* s, strhash_t h) const {
    return m_arr->findForInsertUpdate(s, h);
  }

  MixedArray::Inserter findForNewInsert(int32_t* table, size_t mask,
                                        hash_t h0) const {
    return m_arr->findForNewInsert(table, mask, h0);
  }

  static void copyElm(const Elm& frE, Elm& toE) {
    memcpy(&toE, &frE, sizeof(Elm));
  }

  static void dupElm(const Elm& frE, Elm& toE) {
    assertx(!isTombstoneType(frE.data.m_type));
    memcpy(&toE, &frE, sizeof(Elm));
    if (toE.hasStrKey()) toE.skey->incRefCount();
    tvIncRefGen(toE.data);
  }

  MixedArray* arrayData() { return m_arr; }
  const MixedArray* arrayData() const { return m_arr; }

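  // copyElm() vs dupElm(): copyElm is a raw bitwise transfer that leaves
  // refcounts alone (appropriate when the source slot is being abandoned),
  // while dupElm also incRefs the string key and the value, which is what
  // resizeHelper() needs when duplicating elements into a fresh buffer.
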
  /**
   * Copy the buffer and reset the immutable copy.
   */
  void mutateImpl();

  Elm* data() { return m_arr->data(); }
  const Elm* data() const { return m_arr->data(); }
  int32_t* hashTab() const { return m_arr->hashTab(); }

  void setSize(uint32_t sz) {
    assertx(sz <= cap());
    if (m_arr->isStatic() && m_arr->empty()) {
      assertx(sz == 0);
      return;
    }
    assertx(!arrayData()->hasMultipleRefs());
    m_size = sz;
    arrayData()->m_size = sz;
  }
  void incSize() {
    assertx(m_size + 1 <= cap());
    assertx(!arrayData()->hasMultipleRefs());
    ++m_size;
    arrayData()->m_size = m_size;
  }
  void decSize() {
    assertx(m_size > 0);
    assertx(!arrayData()->hasMultipleRefs());
    --m_size;
    arrayData()->m_size = m_size;
  }
  uint32_t scale() const {
    return arrayData()->scale();
  }
  uint32_t cap() const {
    return arrayData()->capacity();
  }
  uint32_t tableMask() const {
    return arrayData()->mask();
  }
  uint32_t posLimit() const {
    return arrayData()->m_used;
  }
  void incPosLimit() {
    assertx(!arrayData()->hasMultipleRefs());
    assertx(posLimit() + 1 <= cap());
    arrayData()->m_used++;
  }
  void setPosLimit(uint32_t limit) {
    auto* a = arrayData();
    if (a->isStatic() && a->empty()) {
      assertx(limit == 0);
      return;
    }
    assertx(!a->hasMultipleRefs());
    assertx(limit <= cap());
    a->m_used = limit;
  }
  int64_t nextKI() {
    return arrayData()->m_nextKI;
  }
  void setNextKI(int64_t ki) {
    assertx(!arrayData()->hasMultipleRefs());
    arrayData()->m_nextKI = ki;
  }
  void updateNextKI(int64_t ki) {
    assertx(!arrayData()->hasMultipleRefs());
    auto* a = arrayData();
    if (ki >= a->m_nextKI && a->m_nextKI >= 0) {
      a->m_nextKI = static_cast<uint64_t>(ki) + 1;
    }
  }

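  // updateNextKI() keeps m_nextKI one past the largest int key inserted so
  // far (while m_nextKI is non-negative), matching the usual PHP-array rule
  // for assigning the next appended integer key.
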
  // The skipTombstonesNoBoundsCheck helper functions assume that either
  // the specified location is not a tombstone OR that there is at least
  // one non-tombstone after the specified position.

  static int32_t
  skipTombstonesNoBoundsCheck(int32_t pos, int32_t posLimit, const Elm* data) {
    assertx(pos < posLimit);
    while (isTombstone(pos, data)) {
      ++pos;
      assertx(pos < posLimit);
    }
    return pos;
  }

  int32_t skipTombstonesNoBoundsCheck(int32_t pos) {
    return skipTombstonesNoBoundsCheck(pos, posLimit(), data());
  }

  const Elm* firstElmImpl() const {
    const Elm* e = data();
    const Elm* eLimit = elmLimit();
    for (; e != eLimit && isTombstone(e); ++e) {}
    return (Elm*)e;
  }
  Elm* firstElm() {
    return (Elm*)firstElmImpl();
  }
  const Elm* firstElm() const {
    return firstElmImpl();
  }

  Elm* elmLimit() {
    return data() + posLimit();
  }
  const Elm* elmLimit() const {
    return data() + posLimit();
  }

  static Elm* nextElm(Elm* e, Elm* eLimit) {
    assertx(e != eLimit);
    for (++e; e != eLimit && isTombstone(e); ++e) {}
    return e;
  }
  static const Elm* nextElm(const Elm* e, const Elm* eLimit) {
    return (const Elm*)nextElm((Elm*)e, (Elm*)eLimit);
  }

  static bool isTombstoneType(DataType t) {
    return t == kInvalidDataType;
  }

  static bool isTombstone(const Elm* e) {
    return isTombstoneType(e->data.m_type);
  }

  static bool isTombstone(ssize_t pos, const Elm* data) {
    return isTombstoneType(data[pos].data.m_type);
  }

  bool isTombstone(ssize_t pos) const {
    assertx(size_t(pos) <= posLimit());
    return isTombstone(pos, data());
  }

  bool hasTombstones() const { return m_size != posLimit(); }

  static uint32_t computeMaxElms(uint32_t tableMask) {
    return tableMask - tableMask / LoadScale;
  }

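  // Tombstone recap: removal marks a slot's DataType as kInvalidDataType (the
  // tombstone sentinel) instead of sliding later elements down, so m_size
  // lags behind posLimit() until compact() or shrink() runs; hasTombstones()
  // detects exactly that gap.
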
  [[noreturn]] void throwTooLarge();
  [[noreturn]] void throwReserveTooLarge();
  int32_t* warnUnbalanced(size_t n, int32_t* ei) const;

  /**
   * Raises a warning if the set contains an int and a string with the same
   * numeric value: e.g. Set {'123', 123}. It's a no-op otherwise.
   */
  void warnOnStrIntDup() const;

  void scan(type_scan::Scanner& scanner) const {
    scanner.scan(m_arr);
    scanner.scan(m_immCopy);
  }

  struct SortTmp {
    SortTmp(HashCollection* h, SortFunction sf) : m_h(h) {
      if (hasUserDefinedCmp(sf)) {
        m_ad = MixedArray::Copy(m_h->m_arr);
      } else {
        m_h->mutate();
        m_ad = m_h->m_arr;
      }
    }
    ~SortTmp() {
      if (m_h->m_arr != m_ad) {
        Array tmp = Array::attach(m_h->m_arr);
        assertx(m_ad->isDictKind());
        m_h->m_arr = static_cast<MixedArray*>(m_ad);
      }
    }
    ArrayData* operator->() { return m_ad; }
   private:
    HashCollection* m_h;
    ArrayData* m_ad;
  };

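  // SortTmp is an RAII helper for the sort implementations: with a
  // user-defined comparator it sorts a copy of the backing array (the
  // callback could presumably observe or mutate the collection mid-sort),
  // and its destructor installs the sorted copy back into m_h; with a
  // builtin comparator it simply sorts the unshared array in place.
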
 protected:

  // Replace the m_arr field with a new MixedArray. The array must be known to
  // *not* contain any references.
  void replaceArray(ArrayData* adata) {
    auto* oldAd = m_arr;
    dropImmCopy();
    m_arr = MixedArray::asMixed(adata);
    adata->incRefCount();
    m_size = adata->size();
    decRefArr(oldAd);
  }

  union {
    struct {
      uint32_t m_size;
      int32_t m_unused;
    };
    int64_t m_unusedAndSize;
  };

  MixedArray* m_arr; // Elm store.

  // A pointer to an immutable collection that shares its buffer with
  // this collection.
  Object m_immCopy;
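
  // Note on the union above: on little-endian targets m_size aliases the low
  // 32 bits of m_unusedAndSize, which lets the constructors initialize both
  // the size and the unused word with a single 64-bit store.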
};

/////////////////////////////////////////////////////////////////////////////
}

#endif