3 # Copyright 2007 Google Inc.
5 # Licensed under the Apache License, Version 2.0 (the "License");
6 # you may not use this file except in compliance with the License.
7 # You may obtain a copy of the License at
9 # http://www.apache.org/licenses/LICENSE-2.0
11 # Unless required by applicable law or agreed to in writing, software
12 # distributed under the License is distributed on an "AS IS" BASIS,
13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 # See the License for the specific language governing permissions and
15 # limitations under the License.
"""Document matcher for Full Text Search API stub.

DocumentMatcher provides an approximation of the Full Text Search API's query
matching.
"""
import logging

from google.appengine.datastore import document_pb

from google.appengine._internal.antlr3 import tree
from google.appengine.api.search import query_parser
from google.appengine.api.search import QueryParser
from google.appengine.api.search import search_util
from google.appengine.api.search.stub import simple_tokenizer
from google.appengine.api.search.stub import tokens
class DocumentMatcher(object):
  """A class to match documents with a query."""

  def __init__(self, query, inverted_index):
    """Creates a matcher for a parsed query.

    Args:
      query: The parsed query tree that documents are matched against.
      inverted_index: The index used to look up postings; it must provide a
        GetPostingsForToken(token) method.
    """
    # Matches() reads self._query, so the query must be stored here.
    self._query = query
    self._inverted_index = inverted_index
    self._parser = simple_tokenizer.SimpleTokenizer()

  def _PostingsForToken(self, token):
    """Returns the postings for the token."""
    return self._inverted_index.GetPostingsForToken(token)

  def _PostingsForFieldToken(self, field, value):
    """Returns postings for the value occurring in the given field."""
    value = simple_tokenizer.NormalizeString(value)
    return self._PostingsForToken(
        tokens.Token(chars=value, field_name=field))

  def _MatchPhrase(self, field, match, document):
    """Match a textual field with a phrase query node.

    Args:
      field: The document_pb.Field to test.
      match: The phrase query node to match against.
      document: The document that the field is in.

    Returns:
      True iff the phrase occurs in the field. Atom fields must equal the
      phrase text exactly.
    """
    field_text = field.value().string_value()
    phrase_text = query_parser.GetPhraseQueryNodeText(match)

    # Atom fields are not tokenized; only an exact match counts.
    if field.value().type() == document_pb.FieldValue.ATOM:
      return field_text == phrase_text

    phrase = self._parser.TokenizeText(phrase_text)
    field_text = self._parser.TokenizeText(field_text)

    # An empty phrase has no first token to look up; it matches trivially.
    if not phrase:
      return True

    # Find this document's posting for the first word of the phrase.
    posting = None
    for post in self._PostingsForFieldToken(field.name(), phrase[0].chars):
      if post.doc_id == document.id():
        posting = post
        break
    if not posting:
      return False

    def ExtractWords(token_list):
      return (token.chars for token in token_list)

    # Starting at each position of the first word, compare the field's words
    # with the phrase's words pairwise.
    for position in posting.positions:
      # list() keeps len() valid where zip() returns an iterator.
      match_words = list(zip(ExtractWords(field_text[position:]),
                             ExtractWords(phrase)))
      # Fewer pairs than phrase words means the field ended too early.
      if len(match_words) != len(phrase):
        continue
      if all(doc_word == match_word for doc_word, match_word in match_words):
        return True
    return False

  def _MatchTextField(self, field, match, document):
    """Check if a textual field matches a query tree node.

    Args:
      field: The document_pb.Field to test.
      match: The query node to match against.
      document: The document that the field is in.

    Returns:
      True iff the field matches the query node. Node types that are not
      handled here do not match.
    """
    if (match.getType() in (QueryParser.TEXT, QueryParser.NAME) or
        match.getType() in search_util.NUMBER_QUERY_TYPES):

      # Atom fields are compared verbatim with the query text.
      if field.value().type() == document_pb.FieldValue.ATOM:
        return (field.value().string_value() ==
                query_parser.GetQueryNodeText(match))

      query_tokens = self._parser.TokenizeText(
          query_parser.GetQueryNodeText(match))

      # Nothing to look up; an empty query matches trivially.
      if not query_tokens:
        return True

      # Query text that tokenizes into several words matches only if every
      # word matches the field on its own.
      if len(query_tokens) > 1:
        def QueryNode(token):
          return query_parser.CreateQueryNode(token.chars, QueryParser.TEXT)
        return all(self._MatchTextField(field, QueryNode(token), document)
                   for token in query_tokens)

      token_text = query_tokens[0].chars
      matching_docids = [
          post.doc_id for post in self._PostingsForFieldToken(
              field.name(), token_text)]
      return document.id() in matching_docids

    if match.getType() == QueryParser.PHRASE:
      return self._MatchPhrase(field, match, document)

    if match.getType() == QueryParser.CONJUNCTION:
      return all(self._MatchTextField(field, child, document)
                 for child in match.children)

    if match.getType() == QueryParser.DISJUNCTION:
      return any(self._MatchTextField(field, child, document)
                 for child in match.children)

    if match.getType() == QueryParser.NEGATION:
      return not self._MatchTextField(field, match.children[0], document)

    return False

  def _MatchDateField(self, field, match, document):
    """Check if a date field matches a query tree node."""
    return self._MatchComparableField(
        field, match, search_util.DeserializeDate,
        search_util.TEXT_QUERY_TYPES, document)

  def _MatchNumericField(self, field, match, document):
    """Check if a numeric field matches a query tree node."""
    return self._MatchComparableField(
        field, match, float, search_util.NUMBER_QUERY_TYPES, document)

  def _MatchComparableField(
      self, field, match, cast_to_type, query_node_types,
      document):
    """A generic method to test matching for comparable types.

    Comparable types are defined to be anything that supports <, >, <=, >=, ==
    and !=. For our purposes, this is numbers and dates.

    Args:
      field: The document_pb.Field to test
      match: The query node to match against
      cast_to_type: The type to cast the node string values to
      query_node_types: The query node types that would be valid matches
      document: The document that the field is in

    Returns:
      True iff the field matches the query.

    Raises:
      UnsupportedOnDevError: Raised when an unsupported operator is used, or
      when the query node is of the wrong type.
    """
    field_val = cast_to_type(field.value().string_value())

    # A bare value node is treated as an equality test.
    op = QueryParser.EQ

    if match.getType() in query_node_types:
      try:
        match_val = cast_to_type(query_parser.GetQueryNodeText(match))
      except ValueError:
        # The query text cannot be converted, so it cannot match.
        return False
    elif match.children:
      # Otherwise the node type is the operator and its first child holds
      # the value to compare with.
      op = match.getType()
      try:
        match_val = cast_to_type(
            query_parser.GetQueryNodeText(match.children[0]))
      except ValueError:
        return False
    else:
      return False

    if op == QueryParser.EQ:
      return field_val == match_val
    if op == QueryParser.NE:
      return field_val != match_val
    if op == QueryParser.GT:
      return field_val > match_val
    if op == QueryParser.GE:
      return field_val >= match_val
    if op == QueryParser.LT:
      return field_val < match_val
    if op == QueryParser.LE:
      return field_val <= match_val
    raise search_util.UnsupportedOnDevError(
        'Operator %s not supported for numerical fields on development server.'
        % match.getText())

  def _MatchField(self, field, match, document):
    """Check if a field matches a query tree.

    Args:
      field: Either a string containing the name of a field, a query
        node whose text is the name of the field, or a document_pb.Field.
      match: A query node to match the field with.
      document: The document to match.

    Returns:
      True iff the field matches the query node. When a field name is given,
      True iff any field of that name in the document matches.

    Raises:
      UnsupportedOnDevError: Raised when the field's type is not text, number
      or date.
    """
    # Resolve a field name (string or query node) to the document's fields
    # of that name, then test each one.
    if isinstance(field, (basestring, tree.CommonTree)):
      if isinstance(field, tree.CommonTree):
        field = field.getText()
      fields = search_util.GetAllFieldInDocument(document, field)
      return any(self._MatchField(f, match, document) for f in fields)

    if field.value().type() in search_util.TEXT_DOCUMENT_FIELD_TYPES:
      return self._MatchTextField(field, match, document)

    if field.value().type() in search_util.NUMBER_DOCUMENT_FIELD_TYPES:
      return self._MatchNumericField(field, match, document)

    if field.value().type() == document_pb.FieldValue.DATE:
      return self._MatchDateField(field, match, document)

    type_name = document_pb.FieldValue.ContentType_Name(
        field.value().type()).lower()
    raise search_util.UnsupportedOnDevError(
        'Matching fields of type %s is unsupported on dev server (searched for '
        'field %s)' % (type_name, field.name()))

  def _MatchGlobal(self, match, document):
    """Check if any field of the document matches a query node.

    Args:
      match: The query node to match against.
      document: The document to match.

    Returns:
      True iff at least one field of the document matches the query node.
    """
    for field in document.field_list():
      try:
        if self._MatchField(field.name(), match, document):
          return True
      except search_util.UnsupportedOnDevError:
        # A field that cannot be matched here is skipped, so the remaining
        # fields still get a chance to match.
        pass
    return False

  def _CheckMatch(self, node, document):
    """Check if a document matches a query tree."""
    if node.getType() == QueryParser.CONJUNCTION:
      return all(self._CheckMatch(child, document) for child in node.children)

    if node.getType() == QueryParser.DISJUNCTION:
      return any(self._CheckMatch(child, document) for child in node.children)

    if node.getType() == QueryParser.NEGATION:
      return not self._CheckMatch(node.children[0], document)

    # A restriction has two children: the field and the node to match.
    if node.getType() == QueryParser.RESTRICTION:
      field, match = node.children
      return self._MatchField(field, match, document)

    # Any other node is matched against every field of the document.
    return self._MatchGlobal(node, document)

  def Matches(self, document):
    """Check if a document matches the query.

    Args:
      document: The document to match.

    Returns:
      True iff the document matches the query. A query that raises
      UnsupportedOnDevError is logged as a warning and does not match.
    """
    try:
      return self._CheckMatch(self._query, document)
    except search_util.UnsupportedOnDevError as e:
      logging.warning(str(e))
      return False

  def FilterDocuments(self, documents):
    """Returns a generator over the documents that match the query."""
    return (doc for doc in documents if self.Matches(doc))