# Copyright 2007 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Expression evaluator for Full Text Search API stub.

An associated ExpressionEvaluator object is created for every scored document in
search results, and that object evaluates all expressions for that document. The
expression syntax is detailed here:

https://developers.google.com/appengine/docs/python/search/overview#Expressions

Usage examples:

  # Evaluate one expression for scored_doc
  expression = search_service_pb.FieldSpec_Expression()
  expression.set_name('total_value')
  expression.set_expression('max(0, 3 * value + _score)')
  ExpressionEvaluator(scored_doc, inverted_index).Evaluate(expression)
  # scored_doc.expressions['total_value'] is now set to the expression result.

  # Attach the result of all expressions for documents in scored_docs
  for scored_doc in scored_docs:
    evaluator = ExpressionEvaluator(scored_doc, inverted_index)
    for expression in expression_protos:
      evaluator.Evaluate(expression)

Note that this is not used for the production Full Text Search API; this
provides an approximation to the API for local testing with dev_appserver.
"""
import logging

from google.appengine.api.search import expression_parser
from google.appengine.api.search import ExpressionParser
from google.appengine.api.search import query_parser
from google.appengine.api.search import search_util
from google.appengine.api.search.stub import simple_tokenizer
from google.appengine.api.search.stub import tokens
# Markers wrapped around every snippet built by
# ExpressionEvaluator._GenerateSnippet; their combined length counts against
# the snippet's maximum length.
_SNIPPET_PREFIX = '...'
_SNIPPET_SUFFIX = '...'
class _ExpressionError(Exception):
    """Raised when evaluating an expression fails."""
class ExpressionEvaluator(object):
    """Evaluates an expression on scored documents.

    An evaluator is bound to one scored document. `ValueOf` computes the value
    of an expression string for that document and `Evaluate` stores the value
    of an expression protobuffer on the document.
    """

    def __init__(self, document, inverted_index):
        """Constructor.

        Args:
          document: The ScoredDocument to evaluate the expression for.
          inverted_index: The search index (used for snippeting).
        """
        # Both the scored wrapper (score, expressions) and the wrapped document
        # are needed: _Eval reads self._doc.score and Evaluate writes to
        # self._doc.expressions, while field lookups go through self._doc_pb.
        self._doc = document
        self._doc_pb = document.document
        self._inverted_index = inverted_index
        # Query terms are matched case-insensitively, but the snippet text
        # itself is rebuilt from case-preserved tokens.
        self._tokenizer = simple_tokenizer.SimpleTokenizer(preserve_case=False)
        self._case_preserving_tokenizer = simple_tokenizer.SimpleTokenizer(
            preserve_case=True)
        # Maps expression function names to their implementations. Each
        # implementation receives the function's unevaluated argument nodes.
        self._function_table = {
            'max': self._Max,
            'min': self._Min,
            'count': self._Count,
            'snippet': self._Snippet,
            'distance': self._Unsupported('distance'),
        }

    def _Min(self, *nodes):
        """Returns the smallest value among the evaluated argument nodes."""
        return min(self._Eval(node) for node in nodes)

    def _Max(self, *nodes):
        """Returns the largest value among the evaluated argument nodes."""
        return max(self._Eval(node) for node in nodes)

    def _Count(self, node):
        """Returns search_util's field count for the field named by node."""
        return search_util.GetFieldCountInDocument(
            self._doc_pb, query_parser.GetQueryNodeText(node))

    def _GenerateSnippet(self, doc_words, position, max_length):
        """Generate a snippet that fills a given length from a list of tokens.

        The highlighted word is always emitted. Neighbouring words are then
        added in pairs (one to the right, one to the left) while the estimated
        length of the snippet, including prefix and suffix, stays below
        max_length.

        Args:
          doc_words: A list of tokens from the document.
          position: The index of the highlighted word.
          max_length: The maximum length of the output snippet.

        Returns:
          A summary of the given words with the word at index position
          highlighted.
        """
        snippet = '<b>%s</b>' % doc_words[position]
        word_count = len(doc_words)

        # Estimated cost (word plus separating space) of the next expansion on
        # each side; zero when that side has no more words.
        next_len, prev_len = 0, 0
        if position + 1 < word_count:
            next_len = len(doc_words[position + 1]) + 1
        if position > 0:
            prev_len = len(doc_words[position - 1]) + 1

        i = 1
        length_offset = len(_SNIPPET_PREFIX) + len(_SNIPPET_SUFFIX)
        # The left-hand test is ">= 0" so that the very first word of the
        # document can still be added once the right-hand side is exhausted.
        while (len(snippet) + next_len + prev_len + length_offset < max_length
               and (position + i < word_count or position - i >= 0)):
            if position + i < word_count:
                snippet = '%s %s' % (snippet, doc_words[position + i])
                # The word just added serves as the estimate for the next one.
                next_len = len(doc_words[position + i]) + 1
            else:
                next_len = 0

            if position - i >= 0:
                snippet = '%s %s' % (doc_words[position - i], snippet)
                prev_len = len(doc_words[position - i]) + 1
            else:
                prev_len = 0

            i += 1
        return '%s%s%s' % (_SNIPPET_PREFIX, snippet, _SNIPPET_SUFFIX)

    def _Snippet(self, query, field, *args):
        """Create a snippet given a query and the field to query on.

        Args:
          query: A query string containing only a bare term (no operators).
          field: The field name to query on.
          *args: Unused optional arguments. These are not used on
            dev_appserver.

        Returns:
          A snippet for the field with the query term bolded. If no query term
          occurs in this document's field, the start of the field value
          followed by '...'. An empty string if the field value is empty.
        """
        field = query_parser.GetQueryNodeText(field)
        terms = self._tokenizer.TokenizeText(
            query_parser.GetQueryNodeText(query).strip('"'))

        field_val = search_util.GetFieldValue(
            search_util.GetFieldInDocument(self._doc_pb, field))
        if not field_val:
            return ''

        doc_id = self._doc_pb.id()
        for term in terms:
            search_token = tokens.Token(chars=u'%s:%s' % (field, term.chars))
            postings = self._inverted_index.GetPostingsForToken(search_token)
            for posting in postings:
                # Only postings for this document that record at least one
                # position can anchor a snippet.
                if posting.doc_id != doc_id or not posting.positions:
                    continue

                doc_words = [
                    token.chars for token in
                    self._case_preserving_tokenizer.TokenizeText(field_val)]
                # Highlight the first recorded occurrence of the term.
                position = posting.positions[0]
                return self._GenerateSnippet(
                    doc_words, position,
                    search_util.DEFAULT_MAX_SNIPPET_LENGTH)

        # No term matched: fall back to the leading part of the field.
        return '%s...' % field_val[:search_util.DEFAULT_MAX_SNIPPET_LENGTH]

    def _Unsupported(self, method):
        """Returns a function that raises an unsupported error when called.

        This should be used for methods that are not yet implemented in
        dev_appserver but are present in the API. If users call this function,
        the expression will be skipped and a warning will be logged (see
        ValueOf).

        Args:
          method: The name of the method that was called (used for logging).

        Returns:
          A function that raises a UnsupportedOnDevError when called.
        """
        def RaiseUnsupported(*args):
            raise search_util.UnsupportedOnDevError(
                '%s is currently unsupported on dev_appserver.' % method)
        return RaiseUnsupported

    def _EvalBinaryOp(self, op, op_name, node):
        """Evaluate a binary operator on the document.

        Args:
          op: The operator function. Must take exactly two arguments.
          op_name: The name of the operator. Used in error messages.
          node: The expression AST node representing the operator application.

        Returns:
          The result of applying op to node's two children.

        Raises:
          ValueError: The node does not have exactly two children.
        """
        if len(node.children) != 2:
            raise ValueError(
                '%s operator must always have two arguments' % op_name)
        n1, n2 = node.children
        return op(self._Eval(n1), self._Eval(n2))

    def _EvalUnaryOp(self, op, op_name, node):
        """Evaluate a unary operator on the document.

        Args:
          op: The operator function. Must take exactly one argument.
          op_name: The name of the operator. Used in error messages.
          node: The expression AST node representing the operator application.

        Returns:
          The result of applying op to node's child.

        Raises:
          ValueError: The node does not have exactly one child.
        """
        if len(node.children) != 1:
            raise ValueError(
                '%s operator must always have one arguments' % op_name)
        return op(self._Eval(node.children[0]))

    def _Eval(self, node):
        """Evaluate an expression node on the document.

        Args:
          node: The expression AST node representing an expression subtree.

        Returns:
          The Python value that maps to the value of node. Numeric literals are
          returned as floats and phrases as strings with surrounding double
          quotes removed; field references return whatever
          search_util.GetFieldValue yields for the field.

        Raises:
          _ExpressionError: The expression cannot be evaluated on this document
          because either the expression is malformed or the document does not
          contain the required fields. Callers of _Eval should catch
          _ExpressionErrors and optionally log them; these are not fatal in any
          way, and are used to indicate that this expression should not be set
          on this document.
        """
        node_type = node.getType()

        if node_type == ExpressionParser.FN:
            func_name = query_parser.GetQueryNodeText(node)
            func = self._function_table.get(func_name)
            if func is None:
                # Report unknown functions through the documented error type so
                # that ValueOf skips the expression instead of crashing.
                raise _ExpressionError('Unknown function %s' % func_name)
            # Functions receive raw nodes and evaluate them as needed.
            return func(*node.children)

        if node_type == ExpressionParser.PLUS:
            return self._EvalBinaryOp(lambda a, b: a + b, 'addition', node)
        if node_type == ExpressionParser.MINUS:
            return self._EvalBinaryOp(lambda a, b: a - b, 'subtraction', node)
        if node_type == ExpressionParser.DIV:
            return self._EvalBinaryOp(lambda a, b: a / b, 'division', node)
        if node_type == ExpressionParser.TIMES:
            return self._EvalBinaryOp(
                lambda a, b: a * b, 'multiplication', node)
        if node_type == ExpressionParser.NEG:
            return self._EvalUnaryOp(lambda a: -a, 'negation', node)

        if node_type in (ExpressionParser.INT, ExpressionParser.FLOAT):
            return float(query_parser.GetQueryNodeText(node))
        if node_type == ExpressionParser.PHRASE:
            return query_parser.GetQueryNodeText(node).strip('"')

        if node_type == ExpressionParser.NAME:
            name = query_parser.GetQueryNodeText(node)
            # '_score' refers to the document's score, not a stored field.
            if name == '_score':
                return self._doc.score
            field = search_util.GetFieldInDocument(self._doc_pb, name)
            if field:
                return search_util.GetFieldValue(field)
            raise _ExpressionError('No field %s in document' % name)

        raise _ExpressionError('Unable to handle node %s' % node)

    def ValueOf(self, expression, default_value=None):
        """Returns the value of an expression on a document.

        Args:
          expression: The expression string.
          default_value: The value to return if the expression cannot be
            evaluated.

        Returns:
          The value of the expression on the evaluator's document, or
          default_value if the expression cannot be evaluated on the document.
        """
        expression_tree = Parse(expression)
        # A root node with a falsy type and children is unwrapped to its first
        # child before evaluation.
        if not expression_tree.getType() and expression_tree.children:
            expression_tree = expression_tree.children[0]

        try:
            return self._Eval(expression_tree)
        except _ExpressionError as e:
            # Expected for documents the expression does not apply to.
            logging.debug('Skipping expression %s: %s', expression, e)
        except search_util.UnsupportedOnDevError as e:
            # Valid in the API but not implemented by this stub.
            logging.warning(e.args[0])
        return default_value

    def Evaluate(self, expression):
        """Evaluates the expression for a document and attaches the result.

        The result is stored in the document's expressions mapping under the
        expression's name. Nothing is stored when the expression could not be
        evaluated.

        Args:
          expression: The Expression protobuffer object.
        """
        name = expression.name()
        result = self.ValueOf(expression.expression())
        # ValueOf yields None for skipped expressions; do not attach those.
        if result is not None:
            self._doc.expressions[name] = result
def Parse(expression):
    """Parse an expression and return its parse tree.

    Args:
      expression: An expression string.

    Returns:
      A parse tree for the expression, as generated by expression_parser.
    """
    return expression_parser.Parse(expression).tree