From 9650fc6f46fca47fbd78bc9080a88d42b558c438 Mon Sep 17 00:00:00 2001 From: Olly Betts Date: Mon, 4 Dec 2017 11:46:46 +1300 Subject: [PATCH] Handle removing doc term at current TermIterator position Previously the underlying iterator was invalidated, leading to undefined behaviour (typically a segmentation fault). Reported by Gaurav Arora. --- xapian-core/api/documenttermlist.cc | 6 ++++ xapian-core/api/terminfo.cc | 20 ++++++++++-- xapian-core/api/terminfo.h | 55 +++++++++++++++++++++++++++++--- xapian-core/backends/documentinternal.cc | 4 ++- xapian-core/backends/documentinternal.h | 41 ++++++++++++++++-------- xapian-core/tests/api_none.cc | 29 +++++++++++++++++ 6 files changed, 134 insertions(+), 21 deletions(-) diff --git a/xapian-core/api/documenttermlist.cc b/xapian-core/api/documenttermlist.cc index 881f7823c..5e0903a1f 100644 --- a/xapian-core/api/documenttermlist.cc +++ b/xapian-core/api/documenttermlist.cc @@ -85,6 +85,9 @@ DocumentTermList::next() } else { ++it; } + while (it != doc->terms->end() && it->second.is_deleted()) { + ++it; + } return NULL; } @@ -92,6 +95,9 @@ TermList* DocumentTermList::skip_to(const string& term) { it = doc->terms->lower_bound(term); + while (it != doc->terms->end() && it->second.is_deleted()) { + ++it; + } return NULL; } diff --git a/xapian-core/api/terminfo.cc b/xapian-core/api/terminfo.cc index 5232d36e3..1babc4c8b 100644 --- a/xapian-core/api/terminfo.cc +++ b/xapian-core/api/terminfo.cc @@ -22,13 +22,24 @@ #include "terminfo.h" -void -TermInfo::add_position(Xapian::termpos termpos) +#include "omassert.h" + +bool +TermInfo::add_position(Xapian::termcount wdf_inc, Xapian::termpos termpos) { + if (rare(deleted)) { + wdf = wdf_inc; + deleted = false; + positions.push_back(termpos); + return true; + } + + wdf += wdf_inc; + // Optimise the common case of adding positions in ascending order. if (positions.empty() || termpos > positions.back()) { positions.push_back(termpos); - return; + return false; } // We keep positions sorted, so use lower_bound() which can binary chop to @@ -38,11 +49,14 @@ TermInfo::add_position(Xapian::termpos termpos) if (i == positions.cend() || *i != termpos) { positions.insert(i, termpos); } + return false; } bool TermInfo::remove_position(Xapian::termpos termpos) { + Assert(!deleted); + if (rare(positions.empty())) return false; diff --git a/xapian-core/api/terminfo.h b/xapian-core/api/terminfo.h index 6e7fc0b6b..761f84cbb 100644 --- a/xapian-core/api/terminfo.h +++ b/xapian-core/api/terminfo.h @@ -30,6 +30,13 @@ using namespace std; class TermInfo { Xapian::termcount wdf; + /** Flag to indicate if this term was deleted from this document. + * + * We flag entries as deleted instead of actually deleting them to avoid + * invalidating existing TermIterator objects. + */ + bool deleted = false; + /** Positions at which the term occurs. * * The entries are sorted in strictly increasing order (so duplicate @@ -44,6 +51,15 @@ class TermInfo { */ explicit TermInfo(Xapian::termcount wdf_) : wdf(wdf_) {} + /** Constructor which also adds an initial position. + * + * @param wdf_ Within-document frequency + * @param termpos Position to add + */ + TermInfo(Xapian::termcount wdf_, Xapian::termpos termpos) : wdf(wdf_) { + positions.push_back(termpos); + } + /// Get a pointer to the positions. const Xapian::VecCOW* get_positions() const { return &positions; @@ -52,13 +68,22 @@ class TermInfo { /// Get the within-document frequency. Xapian::termcount get_wdf() const { return wdf; } - /// Increase within-document frequency. - void operator+=(Xapian::termcount delta) { + /** Increase within-document frequency. + * + * @return true if the term was flagged as deleted before the operation. + */ + bool increase_wdf(Xapian::termcount delta) { + if (rare(deleted)) { + deleted = false; + wdf = delta; + return true; + } wdf += delta; + return false; } /// Decrease within-document frequency. - void operator-=(Xapian::termcount delta) { + void decrease_wdf(Xapian::termcount delta) { // Saturating arithmetic - don't let the wdf go below zero. if (wdf >= delta) { wdf -= delta; @@ -67,13 +92,32 @@ class TermInfo { } } + bool remove() { + if (deleted) + return false; + positions.clear(); + deleted = true; + return true; + } + /** Add a position. * * If @a termpos is already present, this is a no-op. * + * @param wdf_inc wdf increment * @param termpos Position to add + * + * @return true if the term was flagged as deleted before the operation. */ - void add_position(Xapian::termpos termpos); + bool add_position(Xapian::termcount wdf_inc, Xapian::termpos termpos); + + /** Append a position. + * + * The position must be >= the largest currently in the list. + */ + void append_position(Xapian::termpos termpos) { + positions.push_back(termpos); + } /** Remove a position. * @@ -82,6 +126,9 @@ class TermInfo { * @return If @a termpos wasn't present, returns false. */ bool remove_position(Xapian::termpos tpos); + + /// Is this term flagged as deleted? + bool is_deleted() const { return deleted; } }; #endif // XAPIAN_INCLUDED_TERMINFO_H diff --git a/xapian-core/backends/documentinternal.cc b/xapian-core/backends/documentinternal.cc index 2f12ebcbe..e540f87e2 100644 --- a/xapian-core/backends/documentinternal.cc +++ b/xapian-core/backends/documentinternal.cc @@ -40,16 +40,18 @@ Document::Internal::ensure_terms_fetched() const return; terms.reset(new map()); + termlist_size = 0; if (!database.get()) return; unique_ptr t(database->open_term_list(did)); while (t->next(), !t->at_end()) { + ++termlist_size; auto&& r = terms->emplace(make_pair(t->get_termname(), TermInfo(t->get_wdf()))); TermInfo& term = r.first->second; for (auto p = t->positionlist_begin(); p != PositionIterator(); ++p) { - term.add_position(*p); + term.append_position(*p); } } } diff --git a/xapian-core/backends/documentinternal.h b/xapian-core/backends/documentinternal.h index d77d000e3..2e25b39e7 100644 --- a/xapian-core/backends/documentinternal.h +++ b/xapian-core/backends/documentinternal.h @@ -70,6 +70,14 @@ class Document::Internal : public Xapian::Internal::intrusive_base { */ mutable std::unique_ptr> terms; + /** The number of distinct terms in @a terms. + * + * Only valid when terms is non-NULL. + * + * This may be less than terms.size() if any terms have been deleted. + */ + mutable Xapian::termcount termlist_size; + /** Are there any changes to term positions in @a terms? * * If a document is read from a database, modified and then replaced at @@ -237,10 +245,11 @@ class Document::Internal : public Xapian::Internal::intrusive_base { auto i = terms->find(term); if (i == terms->end()) { + ++termlist_size; terms->emplace(make_pair(term, TermInfo(wdf_inc))); } else { - if (wdf_inc) - i->second += wdf_inc; + if (i->second.increase_wdf(wdf_inc)) + ++termlist_size; } } @@ -252,9 +261,13 @@ class Document::Internal : public Xapian::Internal::intrusive_base { if (i == terms->end()) { return false; } - if (!positions_modified_) - positions_modified_ = !i->second.get_positions()->empty(); - terms->erase(i); + if (!i->second.get_positions()->empty()) { + positions_modified_ = true; + } + if (!i->second.remove()) { + return false; + } + --termlist_size; return true; } @@ -267,12 +280,12 @@ class Document::Internal : public Xapian::Internal::intrusive_base { auto i = terms->find(term); if (i == terms->end()) { - i = terms->emplace(make_pair(term, TermInfo(wdf_inc))).first; - } else { - if (wdf_inc) - i->second += wdf_inc; + ++termlist_size; + terms->emplace(term, TermInfo(wdf_inc, term_pos)); + return; } - i->second.add_position(term_pos); + if (i->second.add_position(wdf_inc, term_pos)) + ++termlist_size; } enum remove_posting_result { OK, NO_TERM, NO_POS }; @@ -285,14 +298,14 @@ class Document::Internal : public Xapian::Internal::intrusive_base { ensure_terms_fetched(); auto i = terms->find(term); - if (i == terms->end()) { + if (i == terms->end() || i->second.is_deleted()) { return remove_posting_result::NO_TERM; } if (!i->second.remove_position(term_pos)) { return remove_posting_result::NO_POS; } if (wdf_dec) - i->second -= wdf_dec; + i->second.decrease_wdf(wdf_dec); positions_modified_ = true; return remove_posting_result::OK; } @@ -302,12 +315,14 @@ class Document::Internal : public Xapian::Internal::intrusive_base { if (!terms) { if (database.get()) { terms.reset(new map()); + termlist_size = 0; } else { // We didn't come from a database, so there are no unfetched // terms to clear. } } else { terms->clear(); + termlist_size = 0; // Assume there was positional data if there's any in the database. positions_modified_ = database.get() && database->has_positions(); } @@ -316,7 +331,7 @@ class Document::Internal : public Xapian::Internal::intrusive_base { /// Return the number of distinct terms in this document. Xapian::termcount termlist_count() const { if (terms) - return terms->size(); + return termlist_size; if (!database.get()) return 0; diff --git a/xapian-core/tests/api_none.cc b/xapian-core/tests/api_none.cc index eaa701e09..6a30863ac 100644 --- a/xapian-core/tests/api_none.cc +++ b/xapian-core/tests/api_none.cc @@ -922,3 +922,32 @@ DEFINE_TESTCASE(orphaneddoctermitor1, !backend) { TEST_EQUAL(*t, "foo"); return true; } + +/** Test removal of terms from a document while iterating over them. + * + * Prior to 1.5.0 and 1.4.6 the underlying iterator was invalidated when + * preinc == false, leading to undefined behaviour (typically a segmentation + * fault). + */ +DEFINE_TESTCASE(deletewhileiterating1, !backend) { + for (bool preinc : { false, true }) { + Xapian::Document doc; + Xapian::TermGenerator indexer; + indexer.set_document(doc); + indexer.index_text("Pull the rug out from under ourselves", 1, "S"); + Xapian::TermIterator term_iterator = doc.termlist_begin(); + term_iterator.skip_to("S"); + while (term_iterator != doc.termlist_end()) { + const string& term = *term_iterator; + if (!startswith(term, "S")) { + break; + } + if (preinc) ++term_iterator; + doc.remove_term(term); + if (!preinc) ++term_iterator; + } + TEST_EQUAL(doc.termlist_count(), 0); + TEST(doc.termlist_begin() == doc.termlist_end()); + } + return true; +} -- 2.11.4.GIT