python/google/appengine/api/search/stub/expression_evaluator.py

   1 #!/usr/bin/env python
   2 #
   3 # Copyright 2007 Google Inc.
   4 #
   5 # Licensed under the Apache License, Version 2.0 (the "License");
   6 # you may not use this file except in compliance with the License.
   7 # You may obtain a copy of the License at
   8 #
   9 #     http://www.apache.org/licenses/LICENSE-2.0
  10 #
  11 # Unless required by applicable law or agreed to in writing, software
  12 # distributed under the License is distributed on an "AS IS" BASIS,
  13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14 # See the License for the specific language governing permissions and
  15 # limitations under the License.
  16 #
  17
  18
  19 """Expression evaluator for Full Text Search API stub.
  20
  21 An associated ExpressionEvaluator object is created for every scored document in
  22 search results, and that object evaluates all expressions for that document. The
  23 expression syntax is detailed here:
  24
  25 https://developers.google.com/appengine/docs/python/search/overview#Expressions
  26
  27 Usage examples:
  28
  29   # Evaluate one expression for scored_doc
  30   expression = search_service_pb.FieldSpec_Expression()
  31   expression.set_name('total_value')
  32   expression.set_expression('max(0, 3 * value + _score)')
  33   ExpressionEvaluator(scored_doc, inverted_index).Evaluate(expression)
  34   # scored_doc.expressions['total_value'] is now set to the expression result.
  35
  36   # Attach the result of all expressions for documents in scored_docs
  37   for scored_doc in scored_docs:
  38     evaluator = ExpressionEvaluator(scored_doc, inverted_index)
  39     for expression in expression_protos:
  40       evaluator.Evaluate(expression)
  41
  42 Note that this is not used for the production Full Text Search API; this
  43 provides an approximation to the API for local testing with dev_appserver.
  44
  45 """
  46
  47
  48
  49
  50
  51
  52 import logging
  53
  54
  55 from google.appengine.api.search import expression_parser
  56 from google.appengine.api.search import ExpressionParser
  57 from google.appengine.api.search import query_parser
  58 from google.appengine.api.search import search_util
  59 from google.appengine.api.search.stub import simple_tokenizer
  60 from google.appengine.api.search.stub import tokens
  61
  62
  63
  64
  65 _SNIPPET_PREFIX = '...'
  66 _SNIPPET_SUFFIX = '...'
  67
  68
  69 class _ExpressionError(Exception):
  70   """Raised when evaluating an expression fails."""
  71
  72
  73 class ExpressionEvaluator(object):
  74   """Evaluates an expression on scored documents."""
  75
  def __init__(self, document, inverted_index):
    """Sets up an evaluator bound to a single scored document.

    Args:
      document: The ScoredDocument to evaluate the expression for.
      inverted_index: The search index (used for snippeting).
    """
    self._doc = document
    self._doc_pb = document.document
    self._inverted_index = inverted_index

    # _tokenizer splits snippet queries; _case_preserving_tokenizer splits the
    # field text that ends up in the snippet.
    self._tokenizer = simple_tokenizer.SimpleTokenizer(preserve_case=False)
    self._case_preserving_tokenizer = simple_tokenizer.SimpleTokenizer(
        preserve_case=True)

    # Functions that have an implementation in this stub.
    function_table = {
        ExpressionParser.COUNT: self._Count,
        ExpressionParser.MAX: self._Max,
        ExpressionParser.MIN: self._Min,
        ExpressionParser.SNIPPET: self._Snippet,
        }

    # Functions that are part of the API but only raise when called here.
    unsupported_functions = (
        (ExpressionParser.ABS, 'abs'),
        (ExpressionParser.DISTANCE, 'distance'),
        (ExpressionParser.GEOPOINT, 'geopoint'),
        (ExpressionParser.LEN, 'len'),
        (ExpressionParser.LOG, 'log'),
        (ExpressionParser.POW, 'pow'),
        (ExpressionParser.SWITCH, 'switch'),
        )
    for node_type, function_name in unsupported_functions:
      function_table[node_type] = self._Unsupported(function_name)

    self._function_table = function_table
 102
 103   def _Min(self, *nodes):
 104     return min(self._Eval(node) for node in nodes)
 105
 106   def _Max(self, *nodes):
 107     return max(self._Eval(node) for node in nodes)
 108
 109   def _Count(self, node):
 110     return search_util.GetFieldCountInDocument(
 111         self._doc_pb, query_parser.GetQueryNodeText(node))
 112
 113   def _GenerateSnippet(self, doc_words, position, max_length):
 114     """Generate a snippet that fills a given length from a list of tokens.
 115
 116     Args:
 117       doc_words: A list of tokens from the document.
 118       position: The index of the highlighted word.
 119       max_length: The maximum length of the output snippet.
 120
 121     Returns:
 122       A summary of the given words with the word at index position highlighted.
 123     """
 124     snippet = '<b>%s</b>' % doc_words[position]
 125
 126     next_len, prev_len = 0, 0
 127     if position + 1 < len(doc_words):
 128
 129       next_len = len(doc_words[position+1]) + 1
 130     if position > 0:
 131
 132       prev_len = len(doc_words[position-1]) + 1
 133
 134
 135     i = 1
 136
 137     length_offset = len(_SNIPPET_PREFIX) + len(_SNIPPET_SUFFIX)
 138     while (len(snippet) + next_len + prev_len + length_offset < max_length and
 139            (position + i < len(doc_words) or position - i > 0)):
 140       if position + i < len(doc_words):
 141         snippet = '%s %s' % (snippet, doc_words[position+i])
 142
 143         next_len = len(doc_words[position+i]) + 1
 144       else:
 145         next_len = 0
 146
 147       if position - i >= 0:
 148         snippet = '%s %s' % (doc_words[position-i], snippet)
 149
 150         prev_len = len(doc_words[position-i]) + 1
 151       else:
 152         prev_len = 0
 153
 154       i += 1
 155     return '%s%s%s' % (_SNIPPET_PREFIX, snippet, _SNIPPET_SUFFIX)
 156
 157
 158
 159
 160   def _Snippet(self, query, field, *args):
 161     """Create a snippet given a query and the field to query on.
 162
 163     Args:
 164       query: A query string containing only a bare term (no operators).
 165       field: The field name to query on.
 166       *args: Unused optional arguments. These are not used on dev_appserver.
 167
 168     Returns:
 169       A snippet for the field with the query term bolded.
 170     """
 171     field = query_parser.GetQueryNodeText(field)
 172     terms = self._tokenizer.TokenizeText(
 173         query_parser.GetQueryNodeText(query).strip('"'))
 174     for term in terms:
 175       search_token = tokens.Token(chars=u'%s:%s' % (field, term.chars))
 176       postings = self._inverted_index.GetPostingsForToken(search_token)
 177       for posting in postings:
 178         if posting.doc_id != self._doc_pb.id() or not posting.positions:
 179           continue
 180
 181         field_val = search_util.GetFieldValue(
 182             search_util.GetFieldInDocument(self._doc_pb, field))
 183         if not field_val:
 184           continue
 185         doc_words = [token.chars for token in
 186                      self._case_preserving_tokenizer.TokenizeText(field_val)]
 187
 188         position = posting.positions[0]
 189         return self._GenerateSnippet(
 190             doc_words, position, search_util.DEFAULT_MAX_SNIPPET_LENGTH)
 191       else:
 192         field_val = search_util.GetFieldValue(
 193             search_util.GetFieldInDocument(self._doc_pb, field))
 194         if not field_val:
 195           return ''
 196         return '%s...' % field_val[:search_util.DEFAULT_MAX_SNIPPET_LENGTH]
 197
 198   def _Unsupported(self, method):
 199     """Returns a function that raises an unsupported error when called.
 200
 201     This should be used for methods that are not yet implemented in
 202     dev_appserver but are present in the API. If users call this function, the
 203     expression will be skipped and a warning will be logged.
 204
 205     Args:
 206       method: The name of the method that was called (used for logging).
 207
 208     Returns:
 209       A function that raises a UnsupportedOnDevError when called.
 210     """
 211
 212
 213
 214
 215     def RaiseUnsupported(*args):
 216       raise search_util.UnsupportedOnDevError(
 217           '%s is currently unsupported on dev_appserver.' % method)
 218     return RaiseUnsupported
 219
 220   def _EvalBinaryOp(self, op, op_name, node):
 221     """Evaluate a binary operator on the document.
 222
 223     Args:
 224       op: The operator function. Must take exactly two arguments.
 225       op_name: The name of the operator. Used in error messages.
 226       node: The expression AST node representing the operator application.
 227
 228     Returns:
 229       The result of applying op to node's two children.
 230
 231     Raises:
 232       ValueError: The node does not have exactly two children.
 233     """
 234     if len(node.children) != 2:
 235       raise ValueError('%s operator must always have two arguments' % op_name)
 236     n1, n2 = node.children
 237     return op(self._Eval(n1), self._Eval(n2))
 238
 239   def _EvalUnaryOp(self, op, op_name, node):
 240     """Evaluate a unary operator on the document.
 241
 242     Args:
 243       op: The operator function. Must take exactly one argument.
 244       op_name: The name of the operator. Used in error messages.
 245       node: The expression AST node representing the operator application.
 246
 247     Returns:
 248       The result of applying op to node's child.
 249
 250     Raises:
 251       ValueError: The node does not have exactly one child.
 252     """
 253     if len(node.children) != 1:
 254       raise ValueError('%s operator must always have one arguments' % op_name)
 255     return op(self._Eval(node.children[0]))
 256
 257   def _Eval(self, node):
 258     """Evaluate an expression node on the document.
 259
 260     Args:
 261       node: The expression AST node representing an expression subtree.
 262
 263     Returns:
 264       The Python value that maps to the value of node. Types are inferred from
 265       the expression, so expressions with numeric results will return as python
 266       int/long/floats, textual results will be strings, and dates will be
 267       datetimes.
 268
 269     Raises:
 270       _ExpressionError: The expression cannot be evaluated on this document
 271       because either the expression is malformed or the document does not
 272       contain the required fields. Callers of _Eval should catch
 273       _ExpressionErrors and optionally log them; these are not fatal in any way,
 274       and are used to indicate that this expression should not be set on this
 275       document.
 276     """
 277     if node.getType() in self._function_table:
 278       func = self._function_table[node.getType()]
 279
 280
 281       return func(*node.children)
 282
 283     if node.getType() == ExpressionParser.PLUS:
 284       return self._EvalBinaryOp(lambda a, b: a + b, 'addition', node)
 285     if node.getType() == ExpressionParser.MINUS:
 286       return self._EvalBinaryOp(lambda a, b: a - b, 'subtraction', node)
 287     if node.getType() == ExpressionParser.DIV:
 288       return self._EvalBinaryOp(lambda a, b: a / b, 'division', node)
 289     if node.getType() == ExpressionParser.TIMES:
 290       return self._EvalBinaryOp(lambda a, b: a * b, 'multiplication', node)
 291     if node.getType() == ExpressionParser.NEG:
 292       return self._EvalUnaryOp(lambda a: -a, 'negation', node)
 293
 294     if node.getType() in (ExpressionParser.INT, ExpressionParser.FLOAT):
 295       return float(query_parser.GetQueryNodeText(node))
 296     if node.getType() == ExpressionParser.PHRASE:
 297       return query_parser.GetQueryNodeText(node).strip('"')
 298
 299     if node.getType() == ExpressionParser.NAME:
 300       name = query_parser.GetQueryNodeText(node)
 301       if name == '_score':
 302         return self._doc.score
 303       field = search_util.GetFieldInDocument(self._doc_pb, name)
 304       if field:
 305         return search_util.GetFieldValue(field)
 306       raise _ExpressionError('No field %s in document' % name)
 307
 308     raise _ExpressionError('Unable to handle node %s' % node)
 309
 310   def ValueOf(self, expression, default_value=None):
 311     """Returns the value of an expression on a document.
 312
 313     Args:
 314       expression: The expression string.
 315       default_value: The value to return if the expression cannot be evaluated.
 316
 317     Returns:
 318       The value of the expression on the evaluator's document, or default_value
 319       if the expression cannot be evaluated on the document.
 320     """
 321     expression_tree = Parse(expression)
 322     if not expression_tree.getType() and expression_tree.children:
 323       expression_tree = expression_tree.children[0]
 324
 325     result = default_value
 326     try:
 327       result = self._Eval(expression_tree)
 328     except _ExpressionError, e:
 329
 330
 331       logging.debug('Skipping expression %s: %s', expression, e)
 332     except search_util.UnsupportedOnDevError, e:
 333
 334
 335       logging.warning(e.args[0])
 336
 337     return result
 338
 339   def Evaluate(self, expression):
 340     """Evaluates the expression for a document and attaches the result.
 341
 342     Args:
 343       expression: The Expression protobuffer object.
 344     """
 345
 346     name = expression.name()
 347     result = self.ValueOf(expression.expression())
 348     if result != None:
 349       self._doc.expressions[name] = result
 350
 351
 352 def Parse(expression):
 353   """Parse an expression and return its parse tree.
 354
 355   Args:
 356     expression: An expression string.
 357
 358   Returns:
 359     A parse tree for the expression, as generated by expression_parser.
 360   """
 361   return expression_parser.Parse(expression).tree