App Engine Python SDK version 1.8.1
[gae.git] / python / google / appengine / api / search / stub / expression_evaluator.py
blobad0dbba0c619c68f1ffbda429ae9fd7d578e4d78
1 #!/usr/bin/env python
3 # Copyright 2007 Google Inc.
5 # Licensed under the Apache License, Version 2.0 (the "License");
6 # you may not use this file except in compliance with the License.
7 # You may obtain a copy of the License at
9 # http://www.apache.org/licenses/LICENSE-2.0
11 # Unless required by applicable law or agreed to in writing, software
12 # distributed under the License is distributed on an "AS IS" BASIS,
13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 # See the License for the specific language governing permissions and
15 # limitations under the License.
19 """Expression evaluator for Full Text Search API stub.
21 An associated ExpressionEvaluator object is created for every scored document in
22 search results, and that object evaluates all expressions for that document. The
23 expression syntax is detailed here:
25 https://developers.google.com/appengine/docs/python/search/overview#Expressions
27 Usage examples:
29 # Evaluate one expression for scored_doc
30 expression = search_service_pb.FieldSpec_Expression()
31 expression.set_name('total_value')
32 expression.set_expression('max(0, 3 * value + _score)')
33 ExpressionEvaluator(scored_doc, inverted_index).Evaluate(expression)
34 # scored_doc.expressions['total_value'] is now set to the expression result.
36 # Attach the result of all expressions for documents in scored_docs
37 for scored_doc in scored_docs:
38 evaluator = ExpressionEvaluator(scored_doc, inverted_index)
39 for expression in expression_protos:
40 evaluator.Evaluate(expression)
42 Note that this is not used for the production Full Text Search API; this
43 provides an approximation to the API for local testing with dev_appserver.
45 """
52 import logging
55 from google.appengine.api.search import expression_parser
56 from google.appengine.api.search import ExpressionParser
57 from google.appengine.api.search import query_parser
58 from google.appengine.api.search import search_util
59 from google.appengine.api.search.stub import simple_tokenizer
60 from google.appengine.api.search.stub import tokens
# Markers wrapped around every snippet built by
# ExpressionEvaluator._GenerateSnippet: the prefix is placed before the snippet
# text and the suffix after it.
_SNIPPET_PREFIX = '...'
_SNIPPET_SUFFIX = '...'
class _ExpressionError(Exception):
  """Signals that an expression could not be evaluated on a document."""

  pass
class ExpressionEvaluator(object):
  """Evaluates expressions against a single scored document.

  One evaluator is bound to one document; Evaluate() attaches expression
  results to that document and ValueOf() returns them directly.
  """

  def __init__(self, document, inverted_index):
    """Constructor.

    Args:
      document: The ScoredDocument to evaluate the expression for.
      inverted_index: The search index (used for snippeting).
    """
    self._doc = document
    self._doc_pb = document.document
    self._inverted_index = inverted_index
    self._tokenizer = simple_tokenizer.SimpleTokenizer(preserve_case=False)
    self._case_preserving_tokenizer = simple_tokenizer.SimpleTokenizer(
        preserve_case=True)
    # Maps a function's parser token type to the callable implementing it.
    # Each callable receives the function node's children (unevaluated).
    self._function_table = {
        ExpressionParser.ABS: self._Unsupported('abs'),
        ExpressionParser.COUNT: self._Count,
        ExpressionParser.DISTANCE: self._Unsupported('distance'),
        ExpressionParser.GEOPOINT: self._Unsupported('geopoint'),
        ExpressionParser.LEN: self._Unsupported('len'),
        ExpressionParser.LOG: self._Unsupported('log'),
        ExpressionParser.MAX: self._Max,
        ExpressionParser.MIN: self._Min,
        ExpressionParser.POW: self._Unsupported('pow'),
        ExpressionParser.SNIPPET: self._Snippet,
        ExpressionParser.SWITCH: self._Unsupported('switch'),
        }

  def _Min(self, *nodes):
    """Returns the smallest value among the evaluated argument nodes."""
    return min(self._Eval(node) for node in nodes)

  def _Max(self, *nodes):
    """Returns the largest value among the evaluated argument nodes."""
    return max(self._Eval(node) for node in nodes)

  def _Count(self, node):
    """Returns the count of the field named by node in the document."""
    return search_util.GetFieldCountInDocument(
        self._doc_pb, query_parser.GetQueryNodeText(node))

  def _GenerateSnippet(self, doc_words, position, max_length):
    """Generate a snippet that fills a given length from a list of tokens.

    The snippet starts as the highlighted word and grows outwards, one word on
    each side per step, for as long as the next step still fits.

    Args:
      doc_words: A list of tokens from the document.
      position: The index of the highlighted word.
      max_length: The maximum length of the output snippet.

    Returns:
      A summary of the given words with the word at index position highlighted.
    """
    snippet = '<b>%s</b>' % doc_words[position]
    num_words = len(doc_words)
    length_offset = len(_SNIPPET_PREFIX) + len(_SNIPPET_SUFFIX)

    i = 1
    while True:
      next_index = position + i
      prev_index = position - i
      has_next = next_index < num_words
      # Index 0 is a valid word, so the lower bound is inclusive.
      has_prev = prev_index >= 0
      if not (has_next or has_prev):
        break

      # Measure the words about to be added (plus one separating space each),
      # so the length check reflects what this step will actually produce.
      next_len = len(doc_words[next_index]) + 1 if has_next else 0
      prev_len = len(doc_words[prev_index]) + 1 if has_prev else 0
      if len(snippet) + next_len + prev_len + length_offset >= max_length:
        break

      if has_next:
        snippet = '%s %s' % (snippet, doc_words[next_index])
      if has_prev:
        snippet = '%s %s' % (doc_words[prev_index], snippet)
      i += 1
    return '%s%s%s' % (_SNIPPET_PREFIX, snippet, _SNIPPET_SUFFIX)

  def _Snippet(self, query, field, *args):
    """Create a snippet given a query and the field to query on.

    Args:
      query: A query string containing only a bare term (no operators).
      field: The field name to query on.
      *args: Unused optional arguments. These are not used on dev_appserver.

    Returns:
      A snippet for the field with the first matching query term bolded. If no
      query term has a posting for this document, the start of the field value
      (up to search_util.DEFAULT_MAX_SNIPPET_LENGTH characters) followed by
      '...'. An empty string if the field value is empty.
    """
    field = query_parser.GetQueryNodeText(field)
    # The field value does not depend on the term, so look it up once.
    field_val = search_util.GetFieldValue(
        search_util.GetFieldInDocument(self._doc_pb, field))
    if not field_val:
      return ''

    terms = self._tokenizer.TokenizeText(
        query_parser.GetQueryNodeText(query).strip('"'))
    doc_id = self._doc_pb.id()
    for term in terms:
      search_token = tokens.Token(chars=u'%s:%s' % (field, term.chars))
      postings = self._inverted_index.GetPostingsForToken(search_token)
      for posting in postings:
        if posting.doc_id != doc_id or not posting.positions:
          continue
        doc_words = [token.chars for token in
                     self._case_preserving_tokenizer.TokenizeText(field_val)]
        # Highlight the first recorded occurrence of the term.
        return self._GenerateSnippet(
            doc_words, posting.positions[0],
            search_util.DEFAULT_MAX_SNIPPET_LENGTH)

    # No term matched this document: fall back to the head of the field.
    return '%s...' % field_val[:search_util.DEFAULT_MAX_SNIPPET_LENGTH]

  def _Unsupported(self, method):
    """Returns a function that raises an unsupported error when called.

    This should be used for methods that are not yet implemented in
    dev_appserver but are present in the API. If users call this function, the
    expression will be skipped and a warning will be logged.

    Args:
      method: The name of the method that was called (used for logging).

    Returns:
      A function that raises a UnsupportedOnDevError when called.
    """

    def RaiseUnsupported(*args):
      raise search_util.UnsupportedOnDevError(
          '%s is currently unsupported on dev_appserver.' % method)
    return RaiseUnsupported

  def _EvalBinaryOp(self, op, op_name, node):
    """Evaluate a binary operator on the document.

    Args:
      op: The operator function. Must take exactly two arguments.
      op_name: The name of the operator. Used in error messages.
      node: The expression AST node representing the operator application.

    Returns:
      The result of applying op to node's two children.

    Raises:
      ValueError: The node does not have exactly two children.
    """
    if len(node.children) != 2:
      raise ValueError('%s operator must always have two arguments' % op_name)
    n1, n2 = node.children
    return op(self._Eval(n1), self._Eval(n2))

  def _EvalUnaryOp(self, op, op_name, node):
    """Evaluate a unary operator on the document.

    Args:
      op: The operator function. Must take exactly one argument.
      op_name: The name of the operator. Used in error messages.
      node: The expression AST node representing the operator application.

    Returns:
      The result of applying op to node's child.

    Raises:
      ValueError: The node does not have exactly one child.
    """
    if len(node.children) != 1:
      raise ValueError('%s operator must always have one arguments' % op_name)
    return op(self._Eval(node.children[0]))

  def _Eval(self, node):
    """Evaluate an expression node on the document.

    Args:
      node: The expression AST node representing an expression subtree.

    Returns:
      The Python value that maps to the value of node. Types are inferred from
      the expression, so expressions with numeric results will return as python
      int/long/floats, textual results will be strings, and dates will be
      datetimes.

    Raises:
      _ExpressionError: The expression cannot be evaluated on this document
      because either the expression is malformed or the document does not
      contain the required fields. Callers of _Eval should catch
      _ExpressionErrors and optionally log them; these are not fatal in any way,
      and are used to indicate that this expression should not be set on this
      document.
    """
    node_type = node.getType()

    if node_type in self._function_table:
      func = self._function_table[node_type]
      # Functions receive their argument nodes unevaluated and evaluate them
      # as needed.
      return func(*node.children)

    if node_type == ExpressionParser.PLUS:
      return self._EvalBinaryOp(lambda a, b: a + b, 'addition', node)
    if node_type == ExpressionParser.MINUS:
      return self._EvalBinaryOp(lambda a, b: a - b, 'subtraction', node)
    if node_type == ExpressionParser.DIV:
      try:
        return self._EvalBinaryOp(lambda a, b: a / b, 'division', node)
      except ZeroDivisionError:
        # Report this as a skippable expression failure so that callers
        # catching _ExpressionError can handle it like any other.
        raise _ExpressionError('Division by zero in expression')
    if node_type == ExpressionParser.TIMES:
      return self._EvalBinaryOp(lambda a, b: a * b, 'multiplication', node)
    if node_type == ExpressionParser.NEG:
      return self._EvalUnaryOp(lambda a: -a, 'negation', node)

    if node_type in (ExpressionParser.INT, ExpressionParser.FLOAT):
      return float(query_parser.GetQueryNodeText(node))
    if node_type == ExpressionParser.PHRASE:
      return query_parser.GetQueryNodeText(node).strip('"')

    if node_type == ExpressionParser.NAME:
      name = query_parser.GetQueryNodeText(node)
      if name == '_score':
        return self._doc.score
      field = search_util.GetFieldInDocument(self._doc_pb, name)
      if field:
        return search_util.GetFieldValue(field)
      raise _ExpressionError('No field %s in document' % name)

    raise _ExpressionError('Unable to handle node %s' % node)

  def ValueOf(self, expression, default_value=None):
    """Returns the value of an expression on a document.

    Args:
      expression: The expression string.
      default_value: The value to return if the expression cannot be evaluated.

    Returns:
      The value of the expression on the evaluator's document, or default_value
      if the expression cannot be evaluated on the document.
    """
    expression_tree = Parse(expression)
    # A root node with a falsy type that has children is unwrapped to its
    # first child before evaluation.
    if not expression_tree.getType() and expression_tree.children:
      expression_tree = expression_tree.children[0]

    result = default_value
    try:
      result = self._Eval(expression_tree)
    except _ExpressionError as e:
      # Not fatal: the expression simply does not apply to this document.
      logging.debug('Skipping expression %s: %s', expression, e)
    except search_util.UnsupportedOnDevError as e:
      # The expression uses a function that the stub does not implement.
      logging.warning(e.args[0])

    return result

  def Evaluate(self, expression):
    """Evaluates the expression for a document and attaches the result.

    The result is stored in the document's expressions mapping under the
    expression's name. Nothing is stored when the expression cannot be
    evaluated.

    Args:
      expression: The Expression protobuffer object.
    """
    name = expression.name()
    result = self.ValueOf(expression.expression())
    if result is not None:
      self._doc.expressions[name] = result
def Parse(expression):
  """Builds the parse tree for an expression string.

  Args:
    expression: The expression string to parse.

  Returns:
    The tree attribute of the result of expression_parser.Parse.
  """
  parse_result = expression_parser.Parse(expression)
  return parse_result.tree