App Engine Python SDK version 1.7.4 (2)
[gae.git] / python / google / appengine / api / search / stub / document_matcher.py
blobf53d774e85b3ff15da558cfed168dac2f8ea61b6
1 #!/usr/bin/env python
3 # Copyright 2007 Google Inc.
5 # Licensed under the Apache License, Version 2.0 (the "License");
6 # you may not use this file except in compliance with the License.
7 # You may obtain a copy of the License at
9 # http://www.apache.org/licenses/LICENSE-2.0
11 # Unless required by applicable law or agreed to in writing, software
12 # distributed under the License is distributed on an "AS IS" BASIS,
13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 # See the License for the specific language governing permissions and
15 # limitations under the License.
21 """Document matcher for Full Text Search API stub.
23 DocumentMatcher provides an approximation of the Full Text Search API's query
24 matching.
25 """
29 import logging
31 from google.appengine.datastore import document_pb
33 from google.appengine._internal.antlr3 import tree
34 from google.appengine.api.search import query_parser
35 from google.appengine.api.search import QueryParser
36 from google.appengine.api.search import search_util
37 from google.appengine.api.search.stub import simple_tokenizer
38 from google.appengine.api.search.stub import tokens
class DocumentMatcher(object):
  """Matches documents against a parsed query for the search API stub.

  This is an approximation of the Full Text Search API's query matching: a
  query tree is walked recursively and each leaf is tested against the
  document's fields, using the inverted index for tokenized text lookups.
  """

  def __init__(self, query, inverted_index):
    """Initializes the matcher.

    Args:
      query: The root node of the query tree that documents are tested
        against.
      inverted_index: The index queried for postings; it must provide
        GetPostingsForToken(token).
    """
    self._query = query
    self._inverted_index = inverted_index
    self._parser = simple_tokenizer.SimpleTokenizer()

  def _PostingsForToken(self, token):
    """Returns the postings for the token."""
    return self._inverted_index.GetPostingsForToken(token)

  def _PostingsForFieldToken(self, field, value):
    """Returns postings for the value occurring in the given field.

    Args:
      field: The name of the field the token is restricted to.
      value: The token text; it is normalized before the lookup.
    """
    normalized = simple_tokenizer.NormalizeString(value)
    return self._PostingsForToken(
        tokens.Token(chars=normalized, field_name=field))

  def _MatchPhrase(self, field, match, document):
    """Match a textual field with a phrase query node.

    Args:
      field: The document_pb.Field to test.
      match: The phrase query node to match against.
      document: The document that the field is in.

    Returns:
      True iff the phrase occurs in the field. An atom field matches only
      when its whole value equals the phrase text; a phrase that tokenizes
      to nothing matches every field.
    """
    field_text = field.value().string_value()
    phrase_text = query_parser.GetPhraseQueryNodeText(match)

    # Atom fields are compared as a whole rather than token by token.
    if field.value().type() == document_pb.FieldValue.ATOM:
      return field_text == phrase_text

    phrase = self._parser.TokenizeText(phrase_text)
    field_tokens = self._parser.TokenizeText(field_text)
    if not phrase:
      return True

    # Find this document's posting for the first word of the phrase.
    posting = None
    for post in self._PostingsForFieldToken(field.name(), phrase[0].chars):
      if post.doc_id == document.id():
        posting = post
        break
    if not posting:
      return False

    # Extract the words once instead of once per candidate position.
    phrase_words = [token.chars for token in phrase]
    field_words = [token.chars for token in field_tokens]
    phrase_length = len(phrase_words)

    # NOTE(review): positions are used as offsets into the tokenized field
    # text -- confirm against the inverted index implementation.
    for position in posting.positions:
      # A window shorter than the phrase (too close to the end of the field)
      # can never compare equal, so no separate length check is needed.
      if field_words[position:][:phrase_length] == phrase_words:
        return True
    return False

  def _MatchTextField(self, field, match, document):
    """Check if a textual field matches a query tree node.

    Args:
      field: The document_pb.Field to test.
      match: The query node to match against.
      document: The document that the field is in.

    Returns:
      True iff the field matches the node. Node types that are not handled
      here never match.
    """
    node_type = match.getType()

    if (node_type in (QueryParser.TEXT, QueryParser.NAME) or
        node_type in search_util.NUMBER_QUERY_TYPES):
      query_text = query_parser.GetQueryNodeText(match)

      # Atom fields match only on equality with the whole query text.
      if field.value().type() == document_pb.FieldValue.ATOM:
        return field.value().string_value() == query_text

      query_tokens = self._parser.TokenizeText(query_text)

      # A query that tokenizes to nothing matches everything.
      if not query_tokens:
        return True

      # Several tokens: every one of them must match on its own.
      if len(query_tokens) > 1:
        return all(
            self._MatchTextField(
                field,
                query_parser.CreateQueryNode(token.chars, QueryParser.TEXT),
                document)
            for token in query_tokens)

      # Single token: the document must appear among the token's postings.
      doc_id = document.id()
      postings = self._PostingsForFieldToken(
          field.name(), query_tokens[0].chars)
      return any(post.doc_id == doc_id for post in postings)

    if node_type == QueryParser.PHRASE:
      return self._MatchPhrase(field, match, document)

    if node_type == QueryParser.CONJUNCTION:
      return all(self._MatchTextField(field, child, document)
                 for child in match.children)

    if node_type == QueryParser.DISJUNCTION:
      return any(self._MatchTextField(field, child, document)
                 for child in match.children)

    if node_type == QueryParser.NEGATION:
      return not self._MatchTextField(field, match.children[0], document)

    return False

  def _MatchDateField(self, field, match, document):
    """Check if a date field matches a query tree node."""
    return self._MatchComparableField(
        field, match, search_util.DeserializeDate,
        search_util.TEXT_QUERY_TYPES, document)

  def _MatchNumericField(self, field, match, document):
    """Check if a numeric field matches a query tree node."""
    return self._MatchComparableField(
        field, match, float, search_util.NUMBER_QUERY_TYPES, document)

  def _MatchComparableField(
      self, field, match, cast_to_type, query_node_types,
      document):
    """A generic method to test matching for comparable types.

    Comparable types are defined to be anything that supports <, >, <=, >=, ==
    and !=. For our purposes, this is numbers and dates.

    Args:
      field: The document_pb.Field to test
      match: The query node to match against
      cast_to_type: The type to cast the node string values to
      query_node_types: The query node types that would be valid matches
      document: The document that the field is in

    Returns:
      True iff the field matches the query.

    Raises:
      UnsupportedOnDevError: Raised when an unsupported operator is used, or
      when the query node is of the wrong type.
    """
    field_val = cast_to_type(field.value().string_value())

    if match.getType() in query_node_types:
      # A bare value node is an implicit equality test.
      op = QueryParser.EQ
      value_node = match
    elif match.children:
      # Otherwise the node is the operator and its first child the value.
      op = match.getType()
      value_node = match.children[0]
    else:
      return False

    try:
      match_val = cast_to_type(query_parser.GetQueryNodeText(value_node))
    except ValueError:
      # Query text that cannot be converted can never match this field.
      return False

    if op == QueryParser.EQ:
      return field_val == match_val
    if op == QueryParser.NE:
      return field_val != match_val
    if op == QueryParser.GT:
      return field_val > match_val
    if op == QueryParser.GE:
      return field_val >= match_val
    if op == QueryParser.LT:
      return field_val < match_val
    if op == QueryParser.LE:
      return field_val <= match_val
    raise search_util.UnsupportedOnDevError(
        'Operator %s not supported for numerical fields on development server.'
        % match.getText())

  def _MatchField(self, field, match, document):
    """Check if a field matches a query tree.

    Args:
      field: Either a string containing the name of a field, a query
        node whose text is the name of the field, or a document_pb.Field.
      match: A query node to match the field with.
      document: The document to match.

    Returns:
      True iff the field matches the query node. When the field is given by
      name, True iff any field of that name in the document matches.

    Raises:
      UnsupportedOnDevError: Raised when the field's type is not a text,
      number or date type.
    """
    if isinstance(field, (basestring, tree.CommonTree)):
      if isinstance(field, tree.CommonTree):
        field = field.getText()
      fields = search_util.GetAllFieldInDocument(document, field)
      return any(self._MatchField(f, match, document) for f in fields)

    field_type = field.value().type()

    if field_type in search_util.TEXT_DOCUMENT_FIELD_TYPES:
      return self._MatchTextField(field, match, document)

    if field_type in search_util.NUMBER_DOCUMENT_FIELD_TYPES:
      return self._MatchNumericField(field, match, document)

    if field_type == document_pb.FieldValue.DATE:
      return self._MatchDateField(field, match, document)

    type_name = document_pb.FieldValue.ContentType_Name(field_type).lower()
    raise search_util.UnsupportedOnDevError(
        'Matching fields of type %s is unsupported on dev server (searched for '
        'field %s)' % (type_name, field.name()))

  def _MatchGlobal(self, match, document):
    """Check if any field of the document matches a query node.

    Args:
      match: The query node to match against.
      document: The document to match.

    Returns:
      True iff at least one field matches. Fields that raise
      UnsupportedOnDevError are skipped rather than failing the whole match.
    """
    # Matching is done by name and therefore already covers every field that
    # shares the name, so each distinct name only needs to be tried once.
    tried_names = set()
    for field in document.field_list():
      name = field.name()
      if name in tried_names:
        continue
      tried_names.add(name)
      try:
        if self._MatchField(name, match, document):
          return True
      except search_util.UnsupportedOnDevError:
        # Deliberate best effort: an unsupported field simply does not match.
        pass
    return False

  def _CheckMatch(self, node, document):
    """Check if a document matches a query tree.

    Args:
      node: The query tree node to evaluate.
      document: The document to match.

    Returns:
      True iff the document matches. Nodes other than conjunction,
      disjunction, negation and restriction are matched against all fields.
    """
    node_type = node.getType()

    if node_type == QueryParser.CONJUNCTION:
      return all(self._CheckMatch(child, document) for child in node.children)

    if node_type == QueryParser.DISJUNCTION:
      return any(self._CheckMatch(child, document) for child in node.children)

    if node_type == QueryParser.NEGATION:
      return not self._CheckMatch(node.children[0], document)

    if node_type == QueryParser.RESTRICTION:
      field, match = node.children
      return self._MatchField(field, match, document)

    return self._MatchGlobal(node, document)

  def Matches(self, document):
    """Returns True iff the document matches the query.

    A query using a feature that is unsupported on the development server is
    logged as a warning and treated as a non-match.
    """
    try:
      return self._CheckMatch(self._query, document)
    except search_util.UnsupportedOnDevError as e:
      logging.warning(str(e))
      return False

  def FilterDocuments(self, documents):
    """Returns a generator over the documents that match the query."""
    return (doc for doc in documents if self.Matches(doc))