python/google/appengine/api/search/stub/expression_evaluator.py

   1 #!/usr/bin/env python
   2 #
   3 # Copyright 2007 Google Inc.
   4 #
   5 # Licensed under the Apache License, Version 2.0 (the "License");
   6 # you may not use this file except in compliance with the License.
   7 # You may obtain a copy of the License at
   8 #
   9 #     http://www.apache.org/licenses/LICENSE-2.0
  10 #
  11 # Unless required by applicable law or agreed to in writing, software
  12 # distributed under the License is distributed on an "AS IS" BASIS,
  13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14 # See the License for the specific language governing permissions and
  15 # limitations under the License.
  16 #
  17
  18
  19 """Expression evaluator for Full Text Search API stub.
  20
  21 An associated ExpressionEvaluator object is created for every scored document in
  22 search results, and that object evaluates all expressions for that document. The
  23 expression syntax is detailed here:
  24
  25 https://developers.google.com/appengine/docs/python/search/overview#Expressions
  26
  27 Usage examples:
  28
  29   # Evaluate one expression for scored_doc
  30   expression = search_service_pb.FieldSpec_Expression()
  31   expression.set_name('total_value')
  32   expression.set_expression('max(0, 3 * value + _score)')
  33   ExpressionEvaluator(scored_doc, inverted_index).Evaluate(expression)
  34   # scored_doc.expressions['total_value'] is now set to the expression result.
  35
  36   # Attach the result of all expressions for documents in scored_docs
  37   for scored_doc in scored_docs:
  38     evaluator = ExpressionEvaluator(scored_doc, inverted_index)
  39     for expression in expression_protos:
  40       evaluator.Evaluate(expression)
  41
  42 Note that this is not used for the production Full Text Search API; this
  43 provides an approximation to the API for local testing with dev_appserver.
  44
  45 """
  46
  47
  48
  49
  50
  51
  52 import logging
  53
  54
  55 from google.appengine.api.search import expression_parser
  56 from google.appengine.api.search import ExpressionParser
  57 from google.appengine.api.search import query_parser
  58 from google.appengine.api.search import search_util
  59 from google.appengine.api.search.stub import simple_tokenizer
  60 from google.appengine.api.search.stub import tokens
  61
  62
  63
  64
# Markers wrapped around every snippet built by
# ExpressionEvaluator._GenerateSnippet; their combined length is counted
# against the snippet's maximum length.
_SNIPPET_PREFIX = '...'
_SNIPPET_SUFFIX = '...'
  67
  68
  69 class _ExpressionError(Exception):
  70   """Raised when evaluating an expression fails."""
  71
  72
  73 class ExpressionEvaluator(object):
  74   """Evaluates an expression on scored documents."""
  75
  76   def __init__(self, document, inverted_index):
  77     """Constructor.
  78
  79     Args:
  80       document: The ScoredDocument to evaluate the expression for.
  81       inverted_index: The search index (used for snippeting).
  82     """
  83     self._doc = document
  84     self._doc_pb = document.document
  85     self._inverted_index = inverted_index
  86     self._tokenizer = simple_tokenizer.SimpleTokenizer(preserve_case=False)
  87     self._case_preserving_tokenizer = simple_tokenizer.SimpleTokenizer(
  88         preserve_case=True)
  89     self._function_table = {
  90         'max': self._Max,
  91         'min': self._Min,
  92         'count': self._Count,
  93         'snippet': self._Snippet,
  94         'distance': self._Unsupported('distance'),
  95         }
  96
  97   def _Min(self, *nodes):
  98     return min(self._Eval(node) for node in nodes)
  99
 100   def _Max(self, *nodes):
 101     return max(self._Eval(node) for node in nodes)
 102
 103   def _Count(self, node):
 104     return search_util.GetFieldCountInDocument(
 105         self._doc_pb, query_parser.GetQueryNodeText(node))
 106
 107   def _GenerateSnippet(self, doc_words, position, max_length):
 108     """Generate a snippet that fills a given length from a list of tokens.
 109
 110     Args:
 111       doc_words: A list of tokens from the document.
 112       position: The index of the highlighted word.
 113       max_length: The maximum length of the output snippet.
 114
 115     Returns:
 116       A summary of the given words with the word at index position highlighted.
 117     """
 118     snippet = '<b>%s</b>' % doc_words[position]
 119
 120     next_len, prev_len = 0, 0
 121     if position + 1 < len(doc_words):
 122
 123       next_len = len(doc_words[position+1]) + 1
 124     if position > 0:
 125
 126       prev_len = len(doc_words[position-1]) + 1
 127
 128
 129     i = 1
 130
 131     length_offset = len(_SNIPPET_PREFIX) + len(_SNIPPET_SUFFIX)
 132     while (len(snippet) + next_len + prev_len + length_offset < max_length and
 133            (position + i < len(doc_words) or position - i > 0)):
 134       if position + i < len(doc_words):
 135         snippet = '%s %s' % (snippet, doc_words[position+i])
 136
 137         next_len = len(doc_words[position+i]) + 1
 138       else:
 139         next_len = 0
 140
 141       if position - i >= 0:
 142         snippet = '%s %s' % (doc_words[position-i], snippet)
 143
 144         prev_len = len(doc_words[position-i]) + 1
 145       else:
 146         prev_len = 0
 147
 148       i += 1
 149     return '%s%s%s' % (_SNIPPET_PREFIX, snippet, _SNIPPET_SUFFIX)
 150
 151
 152
 153
 154   def _Snippet(self, query, field, *args):
 155     """Create a snippet given a query and the field to query on.
 156
 157     Args:
 158       query: A query string containing only a bare term (no operators).
 159       field: The field name to query on.
 160       *args: Unused optional arguments. These are not used on dev_appserver.
 161
 162     Returns:
 163       A snippet for the field with the query term bolded.
 164     """
 165     field = query_parser.GetQueryNodeText(field)
 166     terms = self._tokenizer.TokenizeText(
 167         query_parser.GetQueryNodeText(query).strip('"'))
 168     for term in terms:
 169       search_token = tokens.Token(chars=u'%s:%s' % (field, term.chars))
 170       postings = self._inverted_index.GetPostingsForToken(search_token)
 171       for posting in postings:
 172         if posting.doc_id != self._doc_pb.id() or not posting.positions:
 173           continue
 174
 175         field_val = search_util.GetFieldValue(
 176             search_util.GetFieldInDocument(self._doc_pb, field))
 177         if not field_val:
 178           continue
 179         doc_words = [token.chars for token in
 180                      self._case_preserving_tokenizer.TokenizeText(field_val)]
 181
 182         position = posting.positions[0]
 183         return self._GenerateSnippet(
 184             doc_words, position, search_util.DEFAULT_MAX_SNIPPET_LENGTH)
 185       else:
 186         field_val = search_util.GetFieldValue(
 187             search_util.GetFieldInDocument(self._doc_pb, field))
 188         if not field_val:
 189           return None
 190         return '%s...' % field_val[:search_util.DEFAULT_MAX_SNIPPET_LENGTH]
 191
 192   def _Unsupported(self, method):
 193     """Returns a function that raises an unsupported error when called.
 194
 195     This should be used for methods that are not yet implemented in
 196     dev_appserver but are present in the API. If users call this function, the
 197     expression will be skipped and a warning will be logged.
 198
 199     Args:
 200       method: The name of the method that was called (used for logging).
 201
 202     Returns:
 203       A function that raises a UnsupportedOnDevError when called.
 204     """
 205
 206
 207
 208
 209     def RaiseUnsupported(*args):
 210       raise search_util.UnsupportedOnDevError(
 211           '%s is currently unsupported on dev_appserver.' % method)
 212     return RaiseUnsupported
 213
 214   def _EvalBinaryOp(self, op, op_name, node):
 215     """Evaluate a binary operator on the document.
 216
 217     Args:
 218       op: The operator function. Must take exactly two arguments.
 219       op_name: The name of the operator. Used in error messages.
 220       node: The expression AST node representing the operator application.
 221
 222     Returns:
 223       The result of applying op to node's two children.
 224
 225     Raises:
 226       ValueError: The node does not have exactly two children.
 227     """
 228     if len(node.children) != 2:
 229       raise ValueError('%s operator must always have two arguments' % op_name)
 230     n1, n2 = node.children
 231     return op(self._Eval(n1), self._Eval(n2))
 232
 233   def _EvalUnaryOp(self, op, op_name, node):
 234     """Evaluate a unary operator on the document.
 235
 236     Args:
 237       op: The operator function. Must take exactly one argument.
 238       op_name: The name of the operator. Used in error messages.
 239       node: The expression AST node representing the operator application.
 240
 241     Returns:
 242       The result of applying op to node's child.
 243
 244     Raises:
 245       ValueError: The node does not have exactly one child.
 246     """
 247     if len(node.children) != 1:
 248       raise ValueError('%s operator must always have one arguments' % op_name)
 249     return op(self._Eval(node.children[0]))
 250
 251   def _Eval(self, node):
 252     """Evaluate an expression node on the document.
 253
 254     Args:
 255       node: The expression AST node representing an expression subtree.
 256
 257     Returns:
 258       The Python value that maps to the value of node. Types are inferred from
 259       the expression, so expressions with numeric results will return as python
 260       int/long/floats, textual results will be strings, and dates will be
 261       datetimes.
 262
 263     Raises:
 264       _ExpressionError: The expression cannot be evaluated on this document
 265       because either the expression is malformed or the document does not
 266       contain the required fields. Callers of _Eval should catch
 267       _ExpressionErrors and optionally log them; these are not fatal in any way,
 268       and are used to indicate that this expression should not be set on this
 269       document.
 270     """
 271     if node.getType() == ExpressionParser.FN:
 272       func = self._function_table[query_parser.GetQueryNodeText(node)]
 273
 274
 275       return func(*node.children)
 276
 277     if node.getType() == ExpressionParser.PLUS:
 278       return self._EvalBinaryOp(lambda a, b: a + b, 'addition', node)
 279     if node.getType() == ExpressionParser.MINUS:
 280       return self._EvalBinaryOp(lambda a, b: a - b, 'subtraction', node)
 281     if node.getType() == ExpressionParser.DIV:
 282       return self._EvalBinaryOp(lambda a, b: a / b, 'division', node)
 283     if node.getType() == ExpressionParser.TIMES:
 284       return self._EvalBinaryOp(lambda a, b: a * b, 'multiplication', node)
 285     if node.getType() == ExpressionParser.NEG:
 286       return self._EvalUnaryOp(lambda a: -a, 'negation', node)
 287
 288     if node.getType() in (ExpressionParser.INT, ExpressionParser.FLOAT):
 289       return float(query_parser.GetQueryNodeText(node))
 290     if node.getType() == ExpressionParser.PHRASE:
 291       return query_parser.GetQueryNodeText(node).strip('"')
 292
 293     if node.getType() == ExpressionParser.NAME:
 294       name = query_parser.GetQueryNodeText(node)
 295       if name == '_score':
 296         return self._doc.score
 297       field = search_util.GetFieldInDocument(self._doc_pb, name)
 298       if field:
 299         return search_util.GetFieldValue(field)
 300       raise _ExpressionError('No field %s in document' % name)
 301
 302     raise _ExpressionError('Unable to handle node %s' % node)
 303
 304   def ValueOf(self, expression, default_value=None):
 305     """Returns the value of an expression on a document.
 306
 307     Args:
 308       expression: The expression string.
 309       default_value: The value to return if the expression cannot be evaluated.
 310
 311     Returns:
 312       The value of the expression on the evaluator's document, or default_value
 313       if the expression cannot be evaluated on the document.
 314     """
 315     expression_tree = Parse(expression)
 316     if not expression_tree.getType() and expression_tree.children:
 317       expression_tree = expression_tree.children[0]
 318
 319     result = default_value
 320     try:
 321       result = self._Eval(expression_tree)
 322     except _ExpressionError, e:
 323
 324
 325       logging.debug('Skipping expression %s: %s', expression, e)
 326     except search_util.UnsupportedOnDevError, e:
 327
 328
 329       logging.warning(e.args[0])
 330
 331     return result
 332
 333   def Evaluate(self, expression):
 334     """Evaluates the expression for a document and attaches the result.
 335
 336     Args:
 337       expression: The Expression protobuffer object.
 338     """
 339
 340     name = expression.name()
 341     result = self.ValueOf(expression.expression())
 342     if result != None:
 343       self._doc.expressions[name] = result
 344
 345
 346 def Parse(expression):
 347   """Parse an expression and return its parse tree.
 348
 349   Args:
 350     expression: An expression string.
 351
 352   Returns:
 353     A parse tree for the expression, as generated by expression_parser.
 354   """
 355   return expression_parser.Parse(expression).tree