App Engine Python SDK version 1.7.4 (2)
[gae.git] / python / google / appengine / api / search / stub / expression_evaluator.py
blob532345ad6a5b45a9a8453684076adca942bed0c8
1 #!/usr/bin/env python
3 # Copyright 2007 Google Inc.
5 # Licensed under the Apache License, Version 2.0 (the "License");
6 # you may not use this file except in compliance with the License.
7 # You may obtain a copy of the License at
9 # http://www.apache.org/licenses/LICENSE-2.0
11 # Unless required by applicable law or agreed to in writing, software
12 # distributed under the License is distributed on an "AS IS" BASIS,
13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 # See the License for the specific language governing permissions and
15 # limitations under the License.
"""Expression evaluator for Full Text Search API stub.

An associated ExpressionEvaluator object is created for every scored document in
search results, and that object evaluates all expressions for that document. The
expression syntax is detailed here:

https://developers.google.com/appengine/docs/python/search/overview#Expressions

Usage examples:

  # Evaluate one expression for scored_doc
  expression = search_service_pb.FieldSpec_Expression()
  expression.set_name('total_value')
  expression.set_expression('max(0, 3 * value + _score)')
  ExpressionEvaluator(scored_doc, inverted_index).Evaluate(expression)
  # scored_doc.expressions['total_value'] is now set to the expression result.

  # Attach the result of all expressions for documents in scored_docs
  for scored_doc in scored_docs:
    evaluator = ExpressionEvaluator(scored_doc, inverted_index)
    for expression in expression_protos:
      evaluator.Evaluate(expression)

Note that this is not used for the production Full Text Search API; this
provides an approximation to the API for local testing with dev_appserver.
"""
52 import logging
55 from google.appengine.api.search import expression_parser
56 from google.appengine.api.search import ExpressionParser
57 from google.appengine.api.search import query_parser
58 from google.appengine.api.search import search_util
59 from google.appengine.api.search.stub import simple_tokenizer
60 from google.appengine.api.search.stub import tokens
# Text that ExpressionEvaluator._GenerateSnippet puts in front of every
# generated snippet.
_SNIPPET_PREFIX = '...'

# Text that ExpressionEvaluator._GenerateSnippet puts after every generated
# snippet.
_SNIPPET_SUFFIX = '...'
class _ExpressionError(Exception):
    """Signals that an expression could not be evaluated.

    ExpressionEvaluator._Eval raises it and ExpressionEvaluator.ValueOf catches
    it, so it marks a skipped expression rather than a fatal failure.
    """
class ExpressionEvaluator(object):
    """Evaluates expressions against one scored document.

    Each instance is bound to a single scored document. ValueOf() computes the
    value of an expression string for that document, and Evaluate() stores the
    value of an Expression protobuffer in the document's `expressions`
    mapping.
    """

    def __init__(self, document, inverted_index):
        """Constructor.

        Args:
          document: The ScoredDocument to evaluate the expression for.
          inverted_index: The search index (used for snippeting).
        """
        self._doc = document
        self._doc_pb = document.document
        self._inverted_index = inverted_index
        # Query terms are tokenized case-insensitively; the document text is
        # tokenized with its case preserved so snippets show the original
        # words.
        self._tokenizer = simple_tokenizer.SimpleTokenizer(preserve_case=False)
        self._case_preserving_tokenizer = simple_tokenizer.SimpleTokenizer(
            preserve_case=True)
        # Maps expression function names to their implementations. Each
        # implementation receives the unevaluated argument nodes.
        self._function_table = {
            'max': self._Max,
            'min': self._Min,
            'count': self._Count,
            'snippet': self._Snippet,
            'distance': self._Unsupported('distance'),
        }

    def _Min(self, *nodes):
        """Returns the smallest value among the evaluated argument nodes."""
        return min(self._Eval(node) for node in nodes)

    def _Max(self, *nodes):
        """Returns the largest value among the evaluated argument nodes."""
        return max(self._Eval(node) for node in nodes)

    def _Count(self, node):
        """Returns search_util.GetFieldCountInDocument for the node's text."""
        return search_util.GetFieldCountInDocument(
            self._doc_pb, query_parser.GetQueryNodeText(node))

    def _GenerateSnippet(self, doc_words, position, max_length):
        """Generate a snippet that fills a given length from a list of tokens.

        The highlighted word is always emitted. Neighbouring words are then
        added in pairs (one after, one before), moving outwards, for as long
        as the next pair still fits.

        Args:
          doc_words: A list of tokens from the document.
          position: The index of the highlighted word.
          max_length: The maximum length of the output snippet.

        Returns:
          A summary of the given words with the word at index position
          highlighted. Whenever neighbouring words are included, the result is
          shorter than max_length.
        """
        snippet = '<b>%s</b>' % doc_words[position]
        num_words = len(doc_words)
        # The prefix and suffix are always emitted, so they count against the
        # length budget from the start.
        length_offset = len(_SNIPPET_PREFIX) + len(_SNIPPET_SUFFIX)

        i = 1
        # ">= 0" (not "> 0") so the first word of the document is reachable
        # even when there is nothing left to add on the right-hand side.
        while position + i < num_words or position - i >= 0:
            has_next = position + i < num_words
            has_prev = position - i >= 0

            # Measure the words about to be added (+1 for the joining space),
            # so the size check always looks at what this iteration appends.
            next_len = 0
            if has_next:
                next_len = len(doc_words[position + i]) + 1
            prev_len = 0
            if has_prev:
                prev_len = len(doc_words[position - i]) + 1

            if len(snippet) + next_len + prev_len + length_offset >= max_length:
                break

            if has_next:
                snippet = '%s %s' % (snippet, doc_words[position + i])
            if has_prev:
                snippet = '%s %s' % (doc_words[position - i], snippet)
            i += 1
        return '%s%s%s' % (_SNIPPET_PREFIX, snippet, _SNIPPET_SUFFIX)

    def _Snippet(self, query, field, *args):
        """Create a snippet given a query and the field to query on.

        Args:
          query: A query string containing only a bare term (no operators).
          field: The field name to query on.
          *args: Unused optional arguments. These are not used on
            dev_appserver.

        Returns:
          A snippet for the field with the query term bolded. If no query term
          has a usable posting for this document, the first
          search_util.DEFAULT_MAX_SNIPPET_LENGTH characters of the field value
          followed by '...', or None when the field value is empty.
        """
        field = query_parser.GetQueryNodeText(field)
        terms = self._tokenizer.TokenizeText(
            query_parser.GetQueryNodeText(query).strip('"'))
        for term in terms:
            search_token = tokens.Token(chars=u'%s:%s' % (field, term.chars))
            postings = self._inverted_index.GetPostingsForToken(search_token)
            for posting in postings:
                # Only postings of this document that carry positions can be
                # turned into a highlighted snippet.
                if (posting.doc_id != self._doc_pb.id() or
                        not posting.positions):
                    continue

                field_val = search_util.GetFieldValue(
                    search_util.GetFieldInDocument(self._doc_pb, field))
                if not field_val:
                    continue
                cased_tokens = self._case_preserving_tokenizer.TokenizeText(
                    field_val)
                doc_words = [token.chars for token in cased_tokens]

                # Highlight the first occurrence of the term.
                position = posting.positions[0]
                return self._GenerateSnippet(
                    doc_words, position, search_util.DEFAULT_MAX_SNIPPET_LENGTH)

        # Reached only after every term has been tried without producing a
        # highlighted snippet: fall back to the start of the field value.
        field_val = search_util.GetFieldValue(
            search_util.GetFieldInDocument(self._doc_pb, field))
        if not field_val:
            return None
        return '%s...' % field_val[:search_util.DEFAULT_MAX_SNIPPET_LENGTH]

    def _Unsupported(self, method):
        """Returns a function that raises an unsupported error when called.

        This should be used for methods that are not yet implemented in
        dev_appserver but are present in the API. If users call this function,
        the expression will be skipped and a warning will be logged.

        Args:
          method: The name of the method that was called (used for logging).

        Returns:
          A function that raises a UnsupportedOnDevError when called.
        """

        def RaiseUnsupported(*args):
            raise search_util.UnsupportedOnDevError(
                '%s is currently unsupported on dev_appserver.' % method)
        return RaiseUnsupported

    def _EvalBinaryOp(self, op, op_name, node):
        """Evaluate a binary operator on the document.

        Args:
          op: The operator function. Must take exactly two arguments.
          op_name: The name of the operator. Used in error messages.
          node: The expression AST node representing the operator application.

        Returns:
          The result of applying op to node's two children.

        Raises:
          ValueError: The node does not have exactly two children.
        """
        if len(node.children) != 2:
            raise ValueError(
                '%s operator must always have two arguments' % op_name)
        n1, n2 = node.children
        return op(self._Eval(n1), self._Eval(n2))

    def _EvalUnaryOp(self, op, op_name, node):
        """Evaluate a unary operator on the document.

        Args:
          op: The operator function. Must take exactly one argument.
          op_name: The name of the operator. Used in error messages.
          node: The expression AST node representing the operator application.

        Returns:
          The result of applying op to node's child.

        Raises:
          ValueError: The node does not have exactly one child.
        """
        if len(node.children) != 1:
            raise ValueError(
                '%s operator must always have one arguments' % op_name)
        return op(self._Eval(node.children[0]))

    def _Eval(self, node):
        """Evaluate an expression node on the document.

        Args:
          node: The expression AST node representing an expression subtree.

        Returns:
          The Python value that maps to the value of node. Types are inferred
          from the expression, so expressions with numeric results will return
          as python int/long/floats, textual results will be strings, and
          dates will be datetimes.

        Raises:
          _ExpressionError: The expression cannot be evaluated on this
            document: the expression is malformed, it calls a function this
            stub does not know, it divides by zero, or the document does not
            contain the required fields. Callers of _Eval should catch
            _ExpressionErrors and optionally log them; these are not fatal in
            any way, and are used to indicate that this expression should not
            be set on this document.
        """
        node_type = node.getType()

        if node_type == ExpressionParser.FN:
            func_name = query_parser.GetQueryNodeText(node)
            try:
                func = self._function_table[func_name]
            except KeyError:
                # A function missing from the table must skip the expression,
                # not crash the caller with a KeyError.
                raise _ExpressionError('Unknown function %s' % func_name)
            # Functions receive the raw child nodes and evaluate them as
            # needed.
            return func(*node.children)

        if node_type == ExpressionParser.PLUS:
            return self._EvalBinaryOp(lambda a, b: a + b, 'addition', node)
        if node_type == ExpressionParser.MINUS:
            return self._EvalBinaryOp(lambda a, b: a - b, 'subtraction', node)
        if node_type == ExpressionParser.DIV:
            try:
                return self._EvalBinaryOp(lambda a, b: a / b, 'division', node)
            except ZeroDivisionError:
                raise _ExpressionError('Division by zero')
        if node_type == ExpressionParser.TIMES:
            return self._EvalBinaryOp(
                lambda a, b: a * b, 'multiplication', node)
        if node_type == ExpressionParser.NEG:
            return self._EvalUnaryOp(lambda a: -a, 'negation', node)

        if node_type in (ExpressionParser.INT, ExpressionParser.FLOAT):
            return float(query_parser.GetQueryNodeText(node))
        if node_type == ExpressionParser.PHRASE:
            return query_parser.GetQueryNodeText(node).strip('"')

        if node_type == ExpressionParser.NAME:
            name = query_parser.GetQueryNodeText(node)
            # '_score' is read from the scored document itself rather than
            # looked up as a document field.
            if name == '_score':
                return self._doc.score
            field = search_util.GetFieldInDocument(self._doc_pb, name)
            if field:
                return search_util.GetFieldValue(field)
            raise _ExpressionError('No field %s in document' % name)

        raise _ExpressionError('Unable to handle node %s' % node)

    def ValueOf(self, expression, default_value=None):
        """Returns the value of an expression on a document.

        Args:
          expression: The expression string.
          default_value: The value to return if the expression cannot be
            evaluated.

        Returns:
          The value of the expression on the evaluator's document, or
          default_value if the expression cannot be evaluated on the document.
        """
        # sys.exc_info() is used instead of binding the exception in the
        # except clause, so this code does not depend on either of the two
        # exception-binding syntaxes ("except E, e" / "except E as e").
        import sys

        expression_tree = Parse(expression)
        # A root without a type is only a wrapper; evaluate its first child.
        if not expression_tree.getType() and expression_tree.children:
            expression_tree = expression_tree.children[0]

        try:
            return self._Eval(expression_tree)
        except _ExpressionError:
            # Not fatal: the expression is just skipped for this document.
            logging.debug('Skipping expression %s: %s',
                          expression, sys.exc_info()[1])
        except search_util.UnsupportedOnDevError:
            # Feature missing from the stub: warn and skip.
            logging.warning(sys.exc_info()[1].args[0])
        return default_value

    def Evaluate(self, expression):
        """Evaluates the expression for a document and attaches the result.

        Nothing is attached when the expression has no value (None) for this
        document.

        Args:
          expression: The Expression protobuffer object.
        """
        name = expression.name()
        result = self.ValueOf(expression.expression())
        if result is not None:
            self._doc.expressions[name] = result
def Parse(expression):
    """Build the parse tree for an expression string.

    Args:
      expression: An expression string.

    Returns:
      The `tree` attribute of the object returned by
      expression_parser.Parse(expression).
    """
    parse_result = expression_parser.Parse(expression)
    return parse_result.tree