App Engine Python SDK version 1.9.12
python/google/appengine/api/search/simple_search_stub.py

#!/usr/bin/env python
#
# Copyright 2007 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

21 """Simple RAM backed Search API stub."""
32 import base64
33 import bisect
34 import copy
35 import cPickle as pickle
36 import datetime
37 import functools
38 import hashlib
39 import logging
40 import math
41 import os
42 import random
43 import string
44 import tempfile
45 import threading
46 import urllib
47 import uuid
49 from google.appengine.datastore import document_pb
50 from google.appengine.api import apiproxy_stub
51 from google.appengine.api.namespace_manager import namespace_manager
52 from google.appengine.api.search import query_parser
53 from google.appengine.api.search import QueryParser
54 from google.appengine.api.search import search
55 from google.appengine.api.search import search_service_pb
56 from google.appengine.api.search import search_util
57 from google.appengine.api.search.stub import document_matcher
58 from google.appengine.api.search.stub import expression_evaluator
59 from google.appengine.api.search.stub import simple_tokenizer
60 from google.appengine.api.search.stub import tokens
61 from google.appengine.runtime import apiproxy_errors

__all__ = ['IndexConsistencyError',
           'Posting',
           'PostingList',
           'RamInvertedIndex',
           'SearchServiceStub',
           'SimpleIndex',
           'FieldTypesDict',
          ]

_VISIBLE_PRINTABLE_ASCII = frozenset(
    set(string.printable) - set(string.whitespace))

_FAILED_TO_PARSE_SEARCH_REQUEST = 'Failed to parse search request \"%s\"; %s'

class _InvalidCursorException(Exception):
  """Raised when parsing a cursor fails."""


class IndexConsistencyError(Exception):
  """Deprecated 1.7.7. Accessed index with same name different consistency."""

class Posting(object):
  """Represents occurrences of some token at positions in a document."""

  def __init__(self, doc_id):
    """Initializer.

    Args:
      doc_id: The identifier of the document with token occurrences.

    Raises:
      TypeError: If an unknown argument is passed.
    """
    self._doc_id = doc_id
    self._positions = []

  @property
  def doc_id(self):
    """Return id of the document that the token occurred in."""
    return self._doc_id

  def AddPosition(self, position):
    """Adds the position in token sequence to occurrences for token."""
    pos = bisect.bisect_left(self._positions, position)
    if pos < len(self._positions) and self._positions[pos] == position:
      return
    self._positions.insert(pos, position)

  def RemovePosition(self, position):
    """Removes the position in token sequence from occurrences for token."""
    pos = bisect.bisect_left(self._positions, position)
    if pos < len(self._positions) and self._positions[pos] == position:
      del self._positions[pos]

  def __cmp__(self, other):
    if not isinstance(other, Posting):
      return -2
    return cmp(self.doc_id, other.doc_id)

  @property
  def positions(self):
    return self._positions

  def __repr__(self):
    return search_util.Repr(
        self, [('doc_id', self.doc_id), ('positions', self.positions)])

class PostingList(object):
  """Represents the occurrences of some token across documents.

  A PostingList holds one Posting per document containing the token; each
  Posting records the ordered positions at which the token occurs in that
  document.
  """

  def __init__(self):
    self._postings = []

  def Add(self, doc_id, position):
    """Adds the token position for the given doc_id."""
    posting = Posting(doc_id=doc_id)
    pos = bisect.bisect_left(self._postings, posting)
    if pos < len(self._postings) and self._postings[
        pos].doc_id == posting.doc_id:
      posting = self._postings[pos]
    else:
      self._postings.insert(pos, posting)
    posting.AddPosition(position)

  def Remove(self, doc_id, position):
    """Removes the token position for the given doc_id."""
    posting = Posting(doc_id=doc_id)
    pos = bisect.bisect_left(self._postings, posting)
    if pos < len(self._postings) and self._postings[
        pos].doc_id == posting.doc_id:
      posting = self._postings[pos]
      posting.RemovePosition(position)
      if not posting.positions:
        del self._postings[pos]

  @property
  def postings(self):
    return self._postings

  def __iter__(self):
    return iter(self._postings)

  def __repr__(self):
    return search_util.Repr(self, [('postings', self.postings)])
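
# A minimal usage sketch of the posting structures above (the doc ids and
# positions here are hypothetical, for illustration only):
#
#   plist = PostingList()
#   plist.Add('doc1', 3)
#   plist.Add('doc1', 1)
#   plist.Add('doc2', 7)
#   # plist.postings stays sorted by doc_id, and each Posting keeps its
#   # positions sorted: roughly [Posting('doc1', [1, 3]), Posting('doc2', [7])]
#   plist.Remove('doc1', 1)
#   plist.Remove('doc1', 3)  # 'doc1' has no positions left and is dropped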

class _ScoredDocument(object):
  """A scored document_pb.Document."""

  def __init__(self, document, score):
    self._document = document
    self._score = score
    self._expressions = {}

  @property
  def document(self):
    return self._document

  @property
  def score(self):
    return self._score

  @property
  def expressions(self):
    return self._expressions

  def __repr__(self):
    return search_util.Repr(
        self, [('document', self.document), ('score', self.score)])


class _DocumentStatistics(object):
  """Statistics about terms occurring in a document."""

  def __init__(self):
    self._term_stats = {}

  def __iter__(self):
    for item in self._term_stats.items():
      yield item

  def IncrementTermCount(self, term):
    """Adds an occurrence of the term to the stats for the document."""
    count = 0
    if term in self._term_stats:
      count = self._term_stats[term]
    count += 1
    self._term_stats[term] = count

  def TermFrequency(self, term):
    """Returns the term frequency in the document."""
    if term not in self._term_stats:
      return 0
    return self._term_stats[term]

  @property
  def term_stats(self):
    """Returns the collection of term frequencies in the document."""
    return self._term_stats

  def __eq__(self, other):
    return self.term_stats == other.term_stats

  def __hash__(self):
    # A dict is not hashable; hash a frozenset of its items instead.
    return hash(frozenset(self._term_stats.items()))

  def __repr__(self):
    return search_util.Repr(self, [('term_stats', self.term_stats)])
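
# Accounting sketch for _DocumentStatistics (hypothetical terms): after
# IncrementTermCount('cat') twice and IncrementTermCount('dog') once,
# TermFrequency('cat') == 2, TermFrequency('dog') == 1, and any unseen term
# reports 0.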

class FieldTypesDict(object):
  """Dictionary-like object for type mappings."""

  def __init__(self):
    self._field_types = []

  def __contains__(self, name):
    return name in [f.name() for f in self._field_types]

  def __getitem__(self, name):
    for f in self._field_types:
      if name == f.name():
        return f
    raise KeyError(name)

  def IsType(self, name, field_type):
    if name not in self:
      return False
    schema_type = self[name]
    return field_type in schema_type.type_list()

  def AddFieldType(self, name, field_type):
    field_types = None
    for f in self._field_types:
      if name == f.name():
        field_types = f
    if field_types is None:
      field_types = document_pb.FieldTypes()
      field_types.set_name(name)
      self._field_types.append(field_types)
    if field_type not in field_types.type_list():
      field_types.add_type(field_type)

  def __iter__(self):
    return iter(sorted([f.name() for f in self._field_types]))

  def __repr__(self):
    return repr(self._field_types)

class RamInvertedIndex(object):
  """A simple RAM-resident inverted file over documents."""

  def __init__(self, tokenizer):
    self._tokenizer = tokenizer
    self._inverted_index = {}
    self._schema = FieldTypesDict()
    self._document_ids = set([])

  def _AddDocumentId(self, doc_id):
    """Adds the doc_id to the set in the index."""
    self._document_ids.add(doc_id)

  def _RemoveDocumentId(self, doc_id):
    """Removes the doc_id from the set in the index."""
    if doc_id in self._document_ids:
      self._document_ids.remove(doc_id)

  @property
  def document_count(self):
    return len(self._document_ids)

  def _AddFieldType(self, name, field_type):
    """Adds the type to the list supported for a named field."""
    self._schema.AddFieldType(name, field_type)

  def GetDocumentStats(self, document):
    """Gets statistics about occurrences of terms in document."""
    document_stats = _DocumentStatistics()
    for field in document.field_list():
      for token in self._tokenizer.TokenizeValue(field_value=field.value()):
        document_stats.IncrementTermCount(token.chars)
    return document_stats

  def AddDocument(self, doc_id, document):
    """Adds a document into the index."""
    token_position = 0
    for field in document.field_list():
      self._AddFieldType(field.name(), field.value().type())
      self._AddTokens(doc_id, field.name(), field.value(), token_position)
    self._AddDocumentId(doc_id)

  def RemoveDocument(self, document):
    """Removes a document from the index."""
    doc_id = document.id()
    for field in document.field_list():
      self._RemoveTokens(doc_id, field.name(), field.value())
    self._RemoveDocumentId(doc_id)

  def _AddTokens(self, doc_id, field_name, field_value, token_position):
    """Adds token occurrences for a given doc's field value."""
    for token in self._tokenizer.TokenizeValue(field_value, token_position):
      self._AddToken(doc_id, token)
      self._AddToken(doc_id, token.RestrictField(field_name))

  def _RemoveTokens(self, doc_id, field_name, field_value):
    """Removes token occurrences for a given doc's field value."""
    for token in self._tokenizer.TokenizeValue(field_value=field_value):
      self._RemoveToken(doc_id, token)
      self._RemoveToken(doc_id, token.RestrictField(field_name))

  def _AddToken(self, doc_id, token):
    """Adds a token occurrence for a document."""
    postings = self._inverted_index.get(token)
    if postings is None:
      self._inverted_index[token] = postings = PostingList()
    postings.Add(doc_id, token.position)

  def _RemoveToken(self, doc_id, token):
    """Removes a token occurrence for a document."""
    if token in self._inverted_index:
      postings = self._inverted_index[token]
      postings.Remove(doc_id, token.position)
      if not postings.postings:
        del self._inverted_index[token]

  def GetPostingsForToken(self, token):
    """Returns all document postings for the token."""
    if token in self._inverted_index:
      return self._inverted_index[token].postings
    return []

  def GetSchema(self):
    """Returns the schema for the index."""
    return self._schema

  def __repr__(self):
    return search_util.Repr(self, [('_inverted_index', self._inverted_index),
                                   ('_schema', self._schema),
                                   ('document_count', self.document_count)])
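
# Indexing note: _AddTokens stores every token twice, once bare and once
# field-restricted via tokens.Token.RestrictField, so global queries ('cat')
# and field queries ('body:cat') both resolve through the same posting-list
# lookup in GetPostingsForToken.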

def _ScoreRequested(params):
  """Returns True if match scoring requested, False otherwise."""
  return params.has_scorer_spec() and params.scorer_spec().has_scorer()

class SimpleIndex(object):
  """A simple search service which uses a RAM-resident inverted file."""

  def __init__(self, index_spec):
    self._index_spec = index_spec
    self._documents = {}
    self._parser = simple_tokenizer.SimpleTokenizer(split_restricts=False)
    self._inverted_index = RamInvertedIndex(simple_tokenizer.SimpleTokenizer())

  @property
  def index_spec(self):
    """Returns the index specification for the index."""
    return self._index_spec

  def IndexDocuments(self, documents, response):
    """Indexes an iterable of document_pb.Document."""
    for document in documents:
      doc_id = document.id()
      if not doc_id:
        doc_id = str(uuid.uuid4())
        document.set_id(doc_id)
      try:
        search._NewDocumentFromPb(document)
      except ValueError, e:
        new_status = response.add_status()
        new_status.set_code(
            search_service_pb.SearchServiceError.INVALID_REQUEST)
        new_status.set_error_detail(e.message)
        continue
      response.add_doc_id(doc_id)
      if doc_id in self._documents:
        old_document = self._documents[doc_id]
        self._inverted_index.RemoveDocument(old_document)
      self._documents[doc_id] = document
      new_status = response.add_status()
      new_status.set_code(search_service_pb.SearchServiceError.OK)
      self._inverted_index.AddDocument(doc_id, document)

  def DeleteDocuments(self, document_ids, response):
    """Deletes documents for the given document_ids."""
    for document_id in document_ids:
      self.DeleteDocument(document_id, response.add_status())

  def DeleteDocument(self, document_id, delete_status):
    """Deletes the document, if any, with the given document_id."""
    if document_id in self._documents:
      document = self._documents[document_id]
      self._inverted_index.RemoveDocument(document)
      del self._documents[document_id]
      delete_status.set_code(search_service_pb.SearchServiceError.OK)
    else:
      delete_status.set_code(search_service_pb.SearchServiceError.OK)
      delete_status.set_error_detail('Not found')

  def Documents(self):
    """Returns the documents in the index."""
    return self._documents.values()

  def _TermFrequency(self, term, document):
    """Returns the term frequency in the document."""
    return self._inverted_index.GetDocumentStats(document).TermFrequency(term)

  @property
  def document_count(self):
    """Returns the count of documents in the index."""
    return self._inverted_index.document_count

  def _DocumentCountForTerm(self, term):
    """Returns the document count for documents containing the term."""
    return len(self._PostingsForToken(tokens.Token(chars=term)))

  def _InverseDocumentFrequency(self, term):
    """Returns the inverse document frequency of the term."""
    doc_count = self._DocumentCountForTerm(term)
    if doc_count:
      return math.log10(self.document_count / float(doc_count))
    else:
      return 0

  def _TermFrequencyInverseDocumentFrequency(self, term, document):
    """Returns the term frequency times inverse document frequency of term."""
    return (self._TermFrequency(term, document) *
            self._InverseDocumentFrequency(term))

  def _ScoreDocument(self, document, score, terms):
    """Scores a document for the given query."""
    if not score:
      return 0
    tf_idf = 0
    for term in terms:
      tf_idf += self._TermFrequencyInverseDocumentFrequency(term, document)
    return tf_idf
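
  # Scoring note: _ScoreDocument above computes a plain TF*IDF sum,
  #   score(doc, terms) = sum over t in terms of tf(t, doc) * idf(t)
  # where idf(t) = log10(N / df(t)), N is document_count, and df(t) is the
  # number of documents containing t (idf is taken as 0 when df(t) == 0).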

  def _PostingsForToken(self, token):
    """Returns the postings for the token."""
    return self._inverted_index.GetPostingsForToken(token)

  def _CollectTerms(self, node):
    """Get all search terms for scoring."""
    if node.getType() in search_util.TEXT_QUERY_TYPES:
      return set([query_parser.GetQueryNodeText(node).strip('"')])
    elif node.children:
      if node.getType() == QueryParser.EQ and len(node.children) > 1:
        children = node.children[1:]
      else:
        children = node.children

      result = set()
      for term_set in (self._CollectTerms(child) for child in children):
        result.update(term_set)
      return result
    return set()

  def _CollectFields(self, node):
    if node.getType() == QueryParser.EQ and node.children:
      return set([query_parser.GetQueryNodeText(node.children[0])])
    elif node.children:
      result = set()
      for term_set in (self._CollectFields(child) for child in node.children):
        result.update(term_set)
      return result
    return set()
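
  # Note on the EQ handling in the two collectors above: for a
  # field-restricted query such as field:value, the parser emits an EQ node
  # whose first child is the field name, so _CollectTerms skips children[0]
  # while _CollectFields reads it.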

  def _Evaluate(self, node, score=True):
    """Retrieve scored results for a search query."""
    doc_match = document_matcher.DocumentMatcher(node, self._inverted_index)

    matched_documents = doc_match.FilterDocuments(self._documents.itervalues())
    terms = self._CollectTerms(node)
    scored_documents = [
        _ScoredDocument(doc, self._ScoreDocument(doc, score, terms))
        for doc in matched_documents]
    return scored_documents

  def _Sort(self, docs, search_params, query, score):
    """Returns sorted docs with score or evaluated search_params as sort key."""
    docs = sorted(docs, key=lambda doc: doc.document.order_id(), reverse=True)

    if not search_params.sort_spec_size():
      if score:
        return sorted(docs, key=lambda doc: doc.score, reverse=True)
      return docs

    def SortKey(scored_doc):
      """Returns the sort key for a document based on the request parameters.

      Args:
        scored_doc: The document to score.

      Returns:
        The sort key of a document. The sort key is a tuple, where the nth
        element in the tuple corresponds to the value of the nth sort
        expression evaluated on the document.

      Raises:
        Exception: If no default value is specified.
      """
      expr_vals = []
      for sort_spec in search_params.sort_spec_list():
        default_text = None
        default_numeric = None
        if sort_spec.has_default_value_text():
          default_text = sort_spec.default_value_text()
        if sort_spec.has_default_value_numeric():
          default_numeric = sort_spec.default_value_numeric()
        try:
          text_val = expression_evaluator.ExpressionEvaluator(
              scored_doc, self._inverted_index, True).ValueOf(
                  sort_spec.sort_expression(), default_value=default_text,
                  return_type=search_util.EXPRESSION_RETURN_TYPE_TEXT)
          num_val = expression_evaluator.ExpressionEvaluator(
              scored_doc, self._inverted_index, True).ValueOf(
                  sort_spec.sort_expression(), default_value=default_numeric,
                  return_type=search_util.EXPRESSION_RETURN_TYPE_NUMERIC)
        except expression_evaluator.QueryExpressionEvaluationError, e:
          raise expression_evaluator.ExpressionEvaluationError(
              _FAILED_TO_PARSE_SEARCH_REQUEST % (query, e))
        if isinstance(num_val, datetime.datetime):
          num_val = search_util.EpochTime(num_val)
        elif isinstance(text_val, datetime.datetime):
          num_val = search_util.EpochTime(text_val)

        if text_val is None:
          text_val = ''
        if num_val is None:
          num_val = 0
        expr_vals.append([text_val, num_val])
      return tuple(expr_vals)

    def SortCmp(x, y):
      """The comparison function for sort keys."""
      for i, val_tuple in enumerate(zip(x, y)):
        cmp_val = cmp(*val_tuple)
        if cmp_val:
          if search_params.sort_spec(i).sort_descending():
            return -cmp_val
          return cmp_val
      return 0
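
    # Each sort key element is a [text_value, numeric_value] pair, so cmp
    # compares the text values first and the numeric values second for each
    # sort expression, with the sign flipped for descending sort specs.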
    return sorted(docs, key=SortKey, cmp=SortCmp)

  def _AttachExpressions(self, docs, search_params):
    if search_params.has_field_spec():
      for doc in docs:
        evaluator = expression_evaluator.ExpressionEvaluator(
            doc, self._inverted_index)
        for expr in search_params.field_spec().expression_list():
          evaluator.Evaluate(expr)
    return docs
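
  # Search pipeline: unquote and strip the query, evaluate its parse tree
  # against the inverted index (TF*IDF scoring only when the request names a
  # scorer), sort by score or by the request's sort specs, and finally attach
  # any requested field expressions to each result.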

  def Search(self, search_request):
    """Searches the simple index for documents matching the request query."""
    query = urllib.unquote(search_request.query())
    query = query.strip()
    score = _ScoreRequested(search_request)
    if not query:
      docs = [_ScoredDocument(doc, 0.0) for doc in self._documents.values()]
    else:
      if not isinstance(query, unicode):
        query = unicode(query, 'utf-8')
      query_tree = query_parser.ParseAndSimplify(query)
      docs = self._Evaluate(query_tree, score=score)
    docs = self._Sort(docs, search_request, query, score)
    docs = self._AttachExpressions(docs, search_request)
    return docs

  def GetSchema(self):
    """Returns the schema for the index."""
    return self._inverted_index.GetSchema()

  def __repr__(self):
    return search_util.Repr(self, [('_index_spec', self._index_spec),
                                   ('_documents', self._documents),
                                   ('_inverted_index', self._inverted_index)])

class SearchServiceStub(apiproxy_stub.APIProxyStub):
  """Simple RAM backed Search service stub.

  This stub provides the search_service_pb.SearchService. But this is
  NOT a subclass of SearchService itself. Services are provided by
  the methods prefixed by "_Dynamic_".
  """

  _VERSION = 1

  _MAX_STORAGE_LIMIT = 1024 * 1024 * 1024

  def __init__(self, service_name='search', index_file=None):
    """Constructor.

    Args:
      service_name: Service name expected for all calls.
      index_file: The file to which search indexes will be persisted.
    """
    self.__indexes = {}
    self.__index_file = index_file
    self.__index_file_lock = threading.Lock()
    super(SearchServiceStub, self).__init__(service_name)

    self.Read()

  def _InvalidRequest(self, status, exception):
    status.set_code(search_service_pb.SearchServiceError.INVALID_REQUEST)
    status.set_error_detail(exception.message)

  def _UnknownIndex(self, status, index_spec):
    status.set_code(search_service_pb.SearchServiceError.OK)
    status.set_error_detail(
        "Index '%s' in namespace '%s' does not exist" %
        (index_spec.name(), index_spec.namespace()))

  def _GetNamespace(self, namespace):
    """Gets the namespace name.

    Args:
      namespace: Namespace provided in request arguments.

    Returns:
      If namespace is None, returns the name of the current global namespace.
      If namespace is not None, returns namespace.
    """
    if namespace is not None:
      return namespace
    return namespace_manager.get_namespace()

  def _GetIndex(self, index_spec, create=False):
    namespace = self._GetNamespace(index_spec.namespace())

    index = self.__indexes.setdefault(namespace, {}).get(index_spec.name())
    if index is None and create:
      index = SimpleIndex(index_spec)
      self.__indexes[namespace][index_spec.name()] = index
    return index

  def _Dynamic_IndexDocument(self, request, response):
    """A local implementation of SearchService.IndexDocument RPC.

    Index a new document or update an existing document.

    Args:
      request: A search_service_pb.IndexDocumentRequest.
      response: A search_service_pb.IndexDocumentResponse.
    """
    params = request.params()
    index = self._GetIndex(params.index_spec(), create=True)
    index.IndexDocuments(params.document_list(), response)

  def _Dynamic_DeleteDocument(self, request, response):
    """A local implementation of SearchService.DeleteDocument RPC.

    Args:
      request: A search_service_pb.DeleteDocumentRequest.
      response: A search_service_pb.DeleteDocumentResponse.
    """
    params = request.params()
    index_spec = params.index_spec()
    index = self._GetIndex(index_spec)
    for document_id in params.doc_id_list():
      delete_status = response.add_status()
      if index is None:
        delete_status.set_code(search_service_pb.SearchServiceError.OK)
        delete_status.set_error_detail('Not found')
      else:
        index.DeleteDocument(document_id, delete_status)

  def _Dynamic_ListIndexes(self, request, response):
    """A local implementation of SearchService.ListIndexes RPC.

    Args:
      request: A search_service_pb.ListIndexesRequest.
      response: A search_service_pb.ListIndexesResponse.

    Raises:
      ResponseTooLargeError: raised for testing admin console.
    """
    if request.has_app_id():
      if random.choice([True] + [False] * 9):
        raise apiproxy_errors.ResponseTooLargeError()

      for _ in xrange(random.randint(0, 2) * random.randint(5, 15)):
        new_index_spec = response.add_index_metadata().mutable_index_spec()
        new_index_spec.set_name(
            random.choice(list(_VISIBLE_PRINTABLE_ASCII - set('!'))) +
            ''.join(random.choice(list(_VISIBLE_PRINTABLE_ASCII))
                    for _ in xrange(random.randint(
                        0, search.MAXIMUM_INDEX_NAME_LENGTH))))
      response.mutable_status().set_code(
          random.choice([search_service_pb.SearchServiceError.OK] * 10 +
                        [search_service_pb.SearchServiceError.TRANSIENT_ERROR] +
                        [search_service_pb.SearchServiceError.INTERNAL_ERROR]))
      return

    response.mutable_status().set_code(
        search_service_pb.SearchServiceError.OK)

    namespace = self._GetNamespace(request.params().namespace())
    if namespace not in self.__indexes or not self.__indexes[namespace]:
      return

    keys, indexes = zip(*sorted(
        self.__indexes[namespace].iteritems(), key=lambda v: v[0]))
    position = 0
    params = request.params()
    if params.has_start_index_name():
      position = bisect.bisect_left(keys, params.start_index_name())
      if (not params.include_start_index() and position < len(keys)
          and keys[position] == params.start_index_name()):
        position += 1
    elif params.has_index_name_prefix():
      position = bisect.bisect_left(keys, params.index_name_prefix())
    if params.has_offset():
      position += params.offset()
    end_position = position + params.limit()
    prefix = params.index_name_prefix()
    for index in indexes[min(position, len(keys)):
                         min(end_position, len(keys))]:
      index_spec = index.index_spec
      if prefix and not index_spec.name().startswith(prefix):
        break
      metadata = response.add_index_metadata()
      new_index_spec = metadata.mutable_index_spec()
      new_index_spec.set_name(index_spec.name())
      new_index_spec.set_namespace(index_spec.namespace())
      if params.fetch_schema():
        self._AddSchemaInformation(index, metadata)
      self._AddStorageInformation(index, metadata)

  def _AddSchemaInformation(self, index, metadata_pb):
    schema = index.GetSchema()
    for name in schema:
      field_types = schema[name]
      new_field_types = metadata_pb.add_field()
      new_field_types.MergeFrom(field_types)

  def _AddStorageInformation(self, index, metadata_pb):
    total_usage = 0
    for document in index.Documents():
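      # Approximate each document's footprint as the serialized size of its
      # fields plus the length of its id.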
      for field in document.field_list():
        total_usage += field.ByteSize()
      total_usage += len(document.id())
    storage = metadata_pb.mutable_storage()
    storage.set_amount_used(total_usage)
    storage.set_limit(self._MAX_STORAGE_LIMIT)

  def _AddDocument(self, response, document, ids_only):
    doc = response.add_document()
    if ids_only:
      doc.set_id(document.id())
    else:
      doc.MergeFrom(document)

  def _Dynamic_ListDocuments(self, request, response):
    """A local implementation of SearchService.ListDocuments RPC.

    Args:
      request: A search_service_pb.ListDocumentsRequest.
      response: A search_service_pb.ListDocumentsResponse.
    """
    params = request.params()
    index = self._GetIndex(params.index_spec())
    if index is None:
      response.mutable_status().set_code(
          search_service_pb.SearchServiceError.OK)
      return

    num_docs = 0
    start = not params.has_start_doc_id()
    for document in sorted(index.Documents(), key=lambda doc: doc.id()):
      if start:
        if num_docs < params.limit():
          self._AddDocument(response, document, params.keys_only())
          num_docs += 1
      else:
        if document.id() >= params.start_doc_id():
          start = True
          if (document.id() != params.start_doc_id() or
              params.include_start_doc()):
            self._AddDocument(response, document, params.keys_only())
            num_docs += 1

    response.mutable_status().set_code(
        search_service_pb.SearchServiceError.OK)

  def _RandomSearchResponse(self, request, response):

    random.seed()
    if random.random() < 0.03:
      raise apiproxy_errors.ResponseTooLargeError()
    response.mutable_status().set_code(
        random.choice([search_service_pb.SearchServiceError.OK] * 30 +
                      [search_service_pb.SearchServiceError.TRANSIENT_ERROR] +
                      [search_service_pb.SearchServiceError.INTERNAL_ERROR]))

    params = request.params()
    random.seed(params.query())
    total = random.randint(0, 100)

    if random.random() < 0.3:
      total = 0

    offset = 0
    if params.has_offset():
      offset = params.offset()

    remaining = max(0, total - offset)
    nresults = min(remaining, params.limit())
    matched_count = offset + nresults
    if remaining > nresults:
      matched_count += random.randint(1, 100)

    def RandomText(charset, min_len, max_len):
      return ''.join(random.choice(charset)
                     for _ in xrange(random.randint(min_len, max_len)))

    for i in xrange(nresults):
      seed = '%s:%s' % (params.query(), i + offset)
      random.seed(seed)
      result = response.add_result()
      doc = result.mutable_document()
      doc_id = RandomText(string.letters + string.digits, 8, 10)
      doc.set_id(doc_id)
      random.seed(doc_id)
      for _ in params.sort_spec_list():
        result.add_score(random.random())

      for name, probability in [('creator', 0.90), ('last_change', 0.40)]:
        if random.random() < probability:
          field = doc.add_field()
          field.set_name(name)
          value = field.mutable_value()
          value.set_type(document_pb.FieldValue.TEXT)
          value.set_string_value(
              RandomText(string.letters + string.digits, 2, 10)
              + '@google.com')

      field = doc.add_field()
      field.set_name('content')
      value = field.mutable_value()
      value.set_type(document_pb.FieldValue.TEXT)
      value.set_string_value(
          RandomText(string.printable, 0, 15) + params.query() +
          RandomText(string.printable + 10 * string.whitespace, 5, 5000))

      for i in xrange(random.randint(0, 2)):
        field = doc.add_field()
        field.set_name(RandomText(string.letters, 3, 7))
        value = field.mutable_value()
        value.set_type(document_pb.FieldValue.TEXT)
        value.set_string_value(RandomText(string.printable, 0, 100))

    response.set_matched_count(matched_count)

  def _DefaultFillSearchResponse(self, params, results, response):
    """Fills the SearchResponse with the first set of results."""
    position_range = range(0, min(params.limit(), len(results)))
    self._FillSearchResponse(results, position_range, params.cursor_type(),
                             _ScoreRequested(params), response)

  def _CopyDocument(self, doc, doc_copy, field_names, ids_only=None):
    """Copies Document, doc, to doc_copy restricting fields to field_names."""
    doc_copy.set_id(doc.id())
    if ids_only:
      return
    if doc.has_language():
      doc_copy.set_language(doc.language())
    for field in doc.field_list():
      if not field_names or field.name() in field_names:
        doc_copy.add_field().CopyFrom(field)
    doc_copy.set_order_id(doc.order_id())

  def _FillSearchResponse(self, results, position_range, cursor_type, score,
                          response, field_names=None, ids_only=None):
    """Fills the SearchResponse with a selection of results."""
    for i in position_range:
      result = results[i]
      search_result = response.add_result()
      self._CopyDocument(result.document, search_result.mutable_document(),
                         field_names, ids_only)
      if cursor_type == search_service_pb.SearchParams.PER_RESULT:
        search_result.set_cursor(self._EncodeCursor(result.document))
      if score:
        search_result.add_score(result.score)
      for field, expression in result.expressions.iteritems():
        expr = search_result.add_expression()
        expr.set_name(field)
        if (isinstance(expression, float) or
            isinstance(expression, long) or
            isinstance(expression, int)):
          expr.mutable_value().set_string_value(str(expression))
          expr.mutable_value().set_type(document_pb.FieldValue.NUMBER)
        else:
          expr.mutable_value().set_string_value(expression)
          expr.mutable_value().set_type(document_pb.FieldValue.HTML)

  def _Dynamic_Search(self, request, response):
    """A local implementation of SearchService.Search RPC.

    Args:
      request: A search_service_pb.SearchRequest.
      response: A search_service_pb.SearchResponse.
    """
    if request.has_app_id():
      self._RandomSearchResponse(request, response)
      return

    index = self._GetIndex(request.params().index_spec())
    if index is None:
      self._UnknownIndex(response.mutable_status(),
                         request.params().index_spec())
      response.set_matched_count(0)
      return

    params = request.params()
    try:
      results = index.Search(params)
    except query_parser.QueryException, e:
      self._InvalidRequest(response.mutable_status(), e)
      response.set_matched_count(0)
      return
    except expression_evaluator.ExpressionEvaluationError, e:
      self._InvalidRequest(response.mutable_status(), e)
      response.set_matched_count(0)
      return
    except document_matcher.ExpressionTreeException, e:
      self._InvalidRequest(response.mutable_status(), e)
      response.set_matched_count(0)
      return
    response.set_matched_count(len(results))

    offset = 0
    if params.has_cursor():
      try:
        doc_id = self._DecodeCursor(params.cursor())
      except _InvalidCursorException, e:
        self._InvalidRequest(response.mutable_status(), e)
        response.set_matched_count(0)
        return
      for i, result in enumerate(results):
        if result.document.id() == doc_id:
          offset = i + 1
          break
    elif params.has_offset():
      offset = params.offset()

    if offset < len(results):
      limit = offset + params.limit()
      if limit >= len(results):
        range_end = len(results)
      else:
        range_end = limit
        if params.cursor_type() == search_service_pb.SearchParams.SINGLE:
          document = results[range_end - 1].document
          response.set_cursor(self._EncodeCursor(document))
      result_range = range(offset, range_end)
    else:
      result_range = range(0)
    field_names = params.field_spec().name_list()
    self._FillSearchResponse(results, result_range, params.cursor_type(),
                             _ScoreRequested(params), response, field_names,
                             params.keys_only())

    response.mutable_status().set_code(search_service_pb.SearchServiceError.OK)

  def _EncodeCursor(self, document):
    doc_id_hash = hashlib.sha224(document.id()).hexdigest()
    cursor = doc_id_hash + '|' + document.id()
    return base64.urlsafe_b64encode(cursor)

  def _DecodeCursor(self, encoded_cursor):
    cursor = base64.urlsafe_b64decode(encoded_cursor)
    separator = cursor.find('|')
    if separator < 0:
      raise _InvalidCursorException('Invalid cursor string: ' + encoded_cursor)
    doc_id_hash = cursor[:separator]
    doc_id = cursor[separator + 1:]
    if hashlib.sha224(doc_id).hexdigest() == doc_id_hash:
      return doc_id
    raise _InvalidCursorException('Invalid cursor string: ' + encoded_cursor)
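
  # Cursor format sketch: a cursor is base64.urlsafe_b64encode(
  # sha224_hex(doc_id) + '|' + doc_id), e.g. '<56 hex digits>|doc1' before
  # encoding. _DecodeCursor recomputes the hash so corrupted or forged
  # cursors are rejected with _InvalidCursorException.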

  def __repr__(self):
    return search_util.Repr(self, [('__indexes', self.__indexes)])

  def Write(self):
    """Writes search indexes to the index file.

    This method is a no-op if index_file is set to None.
    """
    if not self.__index_file:
      return
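
    # Pickle the (version, indexes) tuple to a temporary file in the same
    # directory, then rename it over the index file so readers never observe
    # a partial write; where rename cannot replace an existing file (e.g. on
    # Windows), the old file is removed before retrying the rename.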
    descriptor, tmp_filename = tempfile.mkstemp(
        dir=os.path.dirname(self.__index_file))
    tmpfile = os.fdopen(descriptor, 'wb')

    pickler = pickle.Pickler(tmpfile, protocol=1)
    pickler.fast = True
    pickler.dump((self._VERSION, self.__indexes))

    tmpfile.close()

    self.__index_file_lock.acquire()
    try:
      try:
        os.rename(tmp_filename, self.__index_file)
      except OSError:
        os.remove(self.__index_file)
        os.rename(tmp_filename, self.__index_file)
    finally:
      self.__index_file_lock.release()

  def _ReadFromFile(self):
    self.__index_file_lock.acquire()
    try:
      if os.path.isfile(self.__index_file):
        version, indexes = pickle.load(open(self.__index_file, 'rb'))
        if version == self._VERSION:
          return indexes
        logging.warning(
            'Saved search indexes are not compatible with this version of the '
            'SDK. Search indexes have been cleared.')
      else:
        logging.warning(
            'Could not read search indexes from %s', self.__index_file)
    except (AttributeError, LookupError, ImportError, NameError, TypeError,
            ValueError, pickle.PickleError, IOError), e:
      logging.warning(
          'Could not read indexes from %s. Try running with the '
          '--clear_search_index flag. Cause:\n%r' % (self.__index_file, e))
    finally:
      self.__index_file_lock.release()

    return {}

  def Read(self):
    """Reads search indexes from the index file.

    This method is a no-op if index_file is set to None.
    """
    if not self.__index_file:
      return
    read_indexes = self._ReadFromFile()
    if read_indexes:
      self.__indexes = read_indexes