1 // Copyright 2014 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "components/query_parser/query_parser.h"
9 #include "base/compiler_specific.h"
10 #include "base/i18n/break_iterator.h"
11 #include "base/i18n/case_conversion.h"
12 #include "base/logging.h"
13 #include "base/stl_util.h"
14 #include "base/strings/utf_string_conversions.h"
16 namespace query_parser
{
19 // Returns true if |mp1.first| is less than |mp2.first|. This is used to
20 // sort match positions.
21 int CompareMatchPosition(const Snippet::MatchPosition
& mp1
,
22 const Snippet::MatchPosition
& mp2
) {
23 return mp1
.first
< mp2
.first
;
26 // Returns true if |mp2| intersects |mp1|. This is intended for use by
27 // CoalesceMatchesFrom and isn't meant as a general intersection comparison
29 bool SnippetIntersects(const Snippet::MatchPosition
& mp1
,
30 const Snippet::MatchPosition
& mp2
) {
31 return mp2
.first
>= mp1
.first
&& mp2
.first
<= mp1
.second
;
34 // Coalesces match positions in |matches| after index that intersect the match
35 // position at |index|.
36 void CoalesceMatchesFrom(size_t index
, Snippet::MatchPositions
* matches
) {
37 Snippet::MatchPosition
& mp
= (*matches
)[index
];
38 for (Snippet::MatchPositions::iterator i
= matches
->begin() + index
+ 1;
39 i
!= matches
->end(); ) {
40 if (SnippetIntersects(mp
, *i
)) {
41 mp
.second
= std::max(mp
.second
, i
->second
);
42 i
= matches
->erase(i
);
49 // Returns true if the character is considered a quote.
50 bool IsQueryQuote(wchar_t ch
) {
52 ch
== 0xab || // left pointing double angle bracket
53 ch
== 0xbb || // right pointing double angle bracket
54 ch
== 0x201c || // left double quotation mark
55 ch
== 0x201d || // right double quotation mark
56 ch
== 0x201e; // double low-9 quotation mark
61 // Inheritance structure:
62 // Queries are represented as trees of QueryNodes.
63 // QueryNodes are either a collection of subnodes (a QueryNodeList)
64 // or a single word (a QueryNodeWord).
66 // A QueryNodeWord is a single word in the query.
67 class QueryNodeWord
: public QueryNode
{
69 explicit QueryNodeWord(const base::string16
& word
,
70 MatchingAlgorithm matching_algorithm
);
71 ~QueryNodeWord() override
;
73 const base::string16
& word() const { return word_
; }
75 bool literal() const { return literal_
; };
76 void set_literal(bool literal
) { literal_
= literal
; }
79 int AppendToSQLiteQuery(base::string16
* query
) const override
;
80 bool IsWord() const override
;
81 bool Matches(const base::string16
& word
, bool exact
) const override
;
82 bool HasMatchIn(const QueryWordVector
& words
,
83 Snippet::MatchPositions
* match_positions
) const override
;
84 bool HasMatchIn(const QueryWordVector
& words
) const override
;
85 void AppendWords(std::vector
<base::string16
>* words
) const override
;
90 const MatchingAlgorithm matching_algorithm_
;
92 DISALLOW_COPY_AND_ASSIGN(QueryNodeWord
);
95 QueryNodeWord::QueryNodeWord(const base::string16
& word
,
96 MatchingAlgorithm matching_algorithm
)
99 matching_algorithm_(matching_algorithm
) {}
101 QueryNodeWord::~QueryNodeWord() {}
103 int QueryNodeWord::AppendToSQLiteQuery(base::string16
* query
) const {
104 query
->append(word_
);
106 // Use prefix search if we're not literal and long enough.
108 QueryParser::IsWordLongEnoughForPrefixSearch(word_
, matching_algorithm_
))
113 bool QueryNodeWord::IsWord() const {
117 bool QueryNodeWord::Matches(const base::string16
& word
, bool exact
) const {
119 !QueryParser::IsWordLongEnoughForPrefixSearch(word_
, matching_algorithm_
))
120 return word
== word_
;
121 return word
.size() >= word_
.size() &&
122 (word_
.compare(0, word_
.size(), word
, 0, word_
.size()) == 0);
125 bool QueryNodeWord::HasMatchIn(const QueryWordVector
& words
,
126 Snippet::MatchPositions
* match_positions
) const {
127 bool matched
= false;
128 for (size_t i
= 0; i
< words
.size(); ++i
) {
129 if (Matches(words
[i
].word
, false)) {
130 size_t match_start
= words
[i
].position
;
131 match_positions
->push_back(
132 Snippet::MatchPosition(match_start
,
133 match_start
+ static_cast<int>(word_
.size())));
140 bool QueryNodeWord::HasMatchIn(const QueryWordVector
& words
) const {
141 for (size_t i
= 0; i
< words
.size(); ++i
) {
142 if (Matches(words
[i
].word
, false))
148 void QueryNodeWord::AppendWords(std::vector
<base::string16
>* words
) const {
149 words
->push_back(word_
);
152 // A QueryNodeList has a collection of QueryNodes which are deleted in the end.
153 class QueryNodeList
: public QueryNode
{
156 ~QueryNodeList() override
;
158 QueryNodeStarVector
* children() { return &children_
; }
160 void AddChild(QueryNode
* node
);
162 // Remove empty subnodes left over from other parsing.
163 void RemoveEmptySubnodes();
166 int AppendToSQLiteQuery(base::string16
* query
) const override
;
167 bool IsWord() const override
;
168 bool Matches(const base::string16
& word
, bool exact
) const override
;
169 bool HasMatchIn(const QueryWordVector
& words
,
170 Snippet::MatchPositions
* match_positions
) const override
;
171 bool HasMatchIn(const QueryWordVector
& words
) const override
;
172 void AppendWords(std::vector
<base::string16
>* words
) const override
;
175 int AppendChildrenToString(base::string16
* query
) const;
177 QueryNodeStarVector children_
;
180 DISALLOW_COPY_AND_ASSIGN(QueryNodeList
);
183 QueryNodeList::QueryNodeList() {}
185 QueryNodeList::~QueryNodeList() {
186 STLDeleteElements(&children_
);
189 void QueryNodeList::AddChild(QueryNode
* node
) {
190 children_
.push_back(node
);
193 void QueryNodeList::RemoveEmptySubnodes() {
194 for (size_t i
= 0; i
< children_
.size(); ++i
) {
195 if (children_
[i
]->IsWord())
198 QueryNodeList
* list_node
= static_cast<QueryNodeList
*>(children_
[i
]);
199 list_node
->RemoveEmptySubnodes();
200 if (list_node
->children()->empty()) {
201 children_
.erase(children_
.begin() + i
);
208 int QueryNodeList::AppendToSQLiteQuery(base::string16
* query
) const {
209 return AppendChildrenToString(query
);
212 bool QueryNodeList::IsWord() const {
216 bool QueryNodeList::Matches(const base::string16
& word
, bool exact
) const {
221 bool QueryNodeList::HasMatchIn(const QueryWordVector
& words
,
222 Snippet::MatchPositions
* match_positions
) const {
227 bool QueryNodeList::HasMatchIn(const QueryWordVector
& words
) const {
232 void QueryNodeList::AppendWords(std::vector
<base::string16
>* words
) const {
233 for (size_t i
= 0; i
< children_
.size(); ++i
)
234 children_
[i
]->AppendWords(words
);
237 int QueryNodeList::AppendChildrenToString(base::string16
* query
) const {
239 for (QueryNodeStarVector::const_iterator node
= children_
.begin();
240 node
!= children_
.end(); ++node
) {
241 if (node
!= children_
.begin())
242 query
->push_back(L
' ');
243 num_words
+= (*node
)->AppendToSQLiteQuery(query
);
248 // A QueryNodePhrase is a phrase query ("quoted").
249 class QueryNodePhrase
: public QueryNodeList
{
252 ~QueryNodePhrase() override
;
255 int AppendToSQLiteQuery(base::string16
* query
) const override
;
256 bool HasMatchIn(const QueryWordVector
& words
,
257 Snippet::MatchPositions
* match_positions
) const override
;
258 bool HasMatchIn(const QueryWordVector
& words
) const override
;
261 bool MatchesAll(const QueryWordVector
& words
,
262 const QueryWord
** first_word
,
263 const QueryWord
** last_word
) const;
264 DISALLOW_COPY_AND_ASSIGN(QueryNodePhrase
);
267 QueryNodePhrase::QueryNodePhrase() {}
269 QueryNodePhrase::~QueryNodePhrase() {}
271 int QueryNodePhrase::AppendToSQLiteQuery(base::string16
* query
) const {
272 query
->push_back(L
'"');
273 int num_words
= AppendChildrenToString(query
);
274 query
->push_back(L
'"');
278 bool QueryNodePhrase::MatchesAll(const QueryWordVector
& words
,
279 const QueryWord
** first_word
,
280 const QueryWord
** last_word
) const {
281 if (words
.size() < children_
.size())
284 for (size_t i
= 0, max
= words
.size() - children_
.size() + 1; i
< max
; ++i
) {
285 bool matched_all
= true;
286 for (size_t j
= 0; j
< children_
.size(); ++j
) {
287 if (!children_
[j
]->Matches(words
[i
+ j
].word
, true)) {
293 *first_word
= &words
[i
];
294 *last_word
= &words
[i
+ children_
.size() - 1];
301 bool QueryNodePhrase::HasMatchIn(
302 const QueryWordVector
& words
,
303 Snippet::MatchPositions
* match_positions
) const {
304 const QueryWord
* first_word
;
305 const QueryWord
* last_word
;
307 if (MatchesAll(words
, &first_word
, &last_word
)) {
308 match_positions
->push_back(
309 Snippet::MatchPosition(first_word
->position
,
310 last_word
->position
+ last_word
->word
.length()));
316 bool QueryNodePhrase::HasMatchIn(const QueryWordVector
& words
) const {
317 const QueryWord
* first_word
;
318 const QueryWord
* last_word
;
319 return MatchesAll(words
, &first_word
, &last_word
);
322 QueryParser::QueryParser() {}
325 bool QueryParser::IsWordLongEnoughForPrefixSearch(
326 const base::string16
& word
, MatchingAlgorithm matching_algorithm
) {
327 if (matching_algorithm
== MatchingAlgorithm::ALWAYS_PREFIX_SEARCH
)
330 DCHECK(!word
.empty());
331 size_t minimum_length
= 3;
332 // We intentionally exclude Hangul Jamos (both Conjoining and compatibility)
333 // because they 'behave like' Latin letters. Moreover, we should
334 // normalize the former before reaching here.
335 if (0xAC00 <= word
[0] && word
[0] <= 0xD7A3)
337 return word
.size() >= minimum_length
;
340 int QueryParser::ParseQuery(const base::string16
& query
,
341 MatchingAlgorithm matching_algorithm
,
342 base::string16
* sqlite_query
) {
344 if (!ParseQueryImpl(query
, matching_algorithm
, &root
))
346 return root
.AppendToSQLiteQuery(sqlite_query
);
349 void QueryParser::ParseQueryWords(const base::string16
& query
,
350 MatchingAlgorithm matching_algorithm
,
351 std::vector
<base::string16
>* words
) {
353 if (!ParseQueryImpl(query
, matching_algorithm
, &root
))
355 root
.AppendWords(words
);
358 void QueryParser::ParseQueryNodes(const base::string16
& query
,
359 MatchingAlgorithm matching_algorithm
,
360 QueryNodeStarVector
* nodes
) {
362 if (ParseQueryImpl(base::i18n::ToLower(query
), matching_algorithm
, &root
))
363 nodes
->swap(*root
.children());
366 bool QueryParser::DoesQueryMatch(const base::string16
& text
,
367 const QueryNodeStarVector
& query_nodes
,
368 Snippet::MatchPositions
* match_positions
) {
369 if (query_nodes
.empty())
372 QueryWordVector query_words
;
373 base::string16 lower_text
= base::i18n::ToLower(text
);
374 ExtractQueryWords(lower_text
, &query_words
);
376 if (query_words
.empty())
379 Snippet::MatchPositions matches
;
380 for (size_t i
= 0; i
< query_nodes
.size(); ++i
) {
381 if (!query_nodes
[i
]->HasMatchIn(query_words
, &matches
))
384 if (lower_text
.length() != text
.length()) {
385 // The lower case string differs from the original string. The matches are
387 // TODO(sky): we need a better way to align the positions so that we don't
388 // completely punt here.
389 match_positions
->clear();
391 SortAndCoalesceMatchPositions(&matches
);
392 match_positions
->swap(matches
);
397 bool QueryParser::DoesQueryMatch(const QueryWordVector
& query_words
,
398 const QueryNodeStarVector
& query_nodes
) {
399 if (query_nodes
.empty() || query_words
.empty())
402 for (size_t i
= 0; i
< query_nodes
.size(); ++i
) {
403 if (!query_nodes
[i
]->HasMatchIn(query_words
))
409 bool QueryParser::ParseQueryImpl(const base::string16
& query
,
410 MatchingAlgorithm matching_algorithm
,
411 QueryNodeList
* root
) {
412 base::i18n::BreakIterator
iter(query
, base::i18n::BreakIterator::BREAK_WORD
);
413 // TODO(evanm): support a locale here
417 // To handle nesting, we maintain a stack of QueryNodeLists.
418 // The last element (back) of the stack contains the current, deepest node.
419 std::vector
<QueryNodeList
*> query_stack
;
420 query_stack
.push_back(root
);
422 bool in_quotes
= false; // whether we're currently in a quoted phrase
423 while (iter
.Advance()) {
424 // Just found a span between 'prev' (inclusive) and 'pos' (exclusive). It
425 // is not necessarily a word, but could also be a sequence of punctuation
428 QueryNodeWord
* word_node
= new QueryNodeWord(iter
.GetString(),
431 word_node
->set_literal(true);
432 query_stack
.back()->AddChild(word_node
);
433 } else { // Punctuation.
434 if (IsQueryQuote(query
[iter
.prev()])) {
436 QueryNodeList
* quotes_node
= new QueryNodePhrase
;
437 query_stack
.back()->AddChild(quotes_node
);
438 query_stack
.push_back(quotes_node
);
441 query_stack
.pop_back(); // Stop adding to the quoted phrase.
448 root
->RemoveEmptySubnodes();
452 void QueryParser::ExtractQueryWords(const base::string16
& text
,
453 QueryWordVector
* words
) {
454 base::i18n::BreakIterator
iter(text
, base::i18n::BreakIterator::BREAK_WORD
);
455 // TODO(evanm): support a locale here
459 while (iter
.Advance()) {
460 // Just found a span between 'prev' (inclusive) and 'pos' (exclusive). It
461 // is not necessarily a word, but could also be a sequence of punctuation
464 base::string16 word
= iter
.GetString();
466 words
->push_back(QueryWord());
467 words
->back().word
= word
;
468 words
->back().position
= iter
.prev();
475 void QueryParser::SortAndCoalesceMatchPositions(
476 Snippet::MatchPositions
* matches
) {
477 std::sort(matches
->begin(), matches
->end(), &CompareMatchPosition
);
478 // WARNING: we don't use iterator here as CoalesceMatchesFrom may remove
480 for (size_t i
= 0; i
< matches
->size(); ++i
)
481 CoalesceMatchesFrom(i
, matches
);
484 } // namespace query_parser