App Engine Python SDK version 1.8.9
[gae.git] / python / google / appengine / api / search / simple_search_stub.py
blob5f5bf252590a4f98db42af40552ce4203f72ad30
1 #!/usr/bin/env python
3 # Copyright 2007 Google Inc.
5 # Licensed under the Apache License, Version 2.0 (the "License");
6 # you may not use this file except in compliance with the License.
7 # You may obtain a copy of the License at
9 # http://www.apache.org/licenses/LICENSE-2.0
11 # Unless required by applicable law or agreed to in writing, software
12 # distributed under the License is distributed on an "AS IS" BASIS,
13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 # See the License for the specific language governing permissions and
15 # limitations under the License.
21 """Simple RAM backed Search API stub."""
32 import base64
33 import bisect
34 import copy
35 import cPickle as pickle
36 import datetime
37 import functools
38 import logging
39 import math
40 import os
41 import random
42 import string
43 import tempfile
44 import threading
45 import urllib
46 import uuid
48 from google.appengine.datastore import document_pb
49 from google.appengine.api import apiproxy_stub
50 from google.appengine.api.namespace_manager import namespace_manager
51 from google.appengine.api.search import query_parser
52 from google.appengine.api.search import QueryParser
53 from google.appengine.api.search import search
54 from google.appengine.api.search import search_service_pb
55 from google.appengine.api.search import search_util
56 from google.appengine.api.search.stub import document_matcher
57 from google.appengine.api.search.stub import expression_evaluator
58 from google.appengine.api.search.stub import simple_tokenizer
59 from google.appengine.api.search.stub import tokens
60 from google.appengine.runtime import apiproxy_errors
# Public API of this module.  NOTE: the original extraction dropped the
# closing bracket of this list, which made the module unparseable; restored.
__all__ = ['IndexConsistencyError',
           'Posting',
           'PostingList',
           'RamInvertedIndex',
           'SearchServiceStub',
           'SimpleIndex',
           'FieldTypesDict',
          ]

# Printable ASCII characters that are actually visible (not whitespace);
# used to build random index names for the admin-console test response.
_VISIBLE_PRINTABLE_ASCII = frozenset(
    set(string.printable) - set(string.whitespace))
class IndexConsistencyError(Exception):
  """Raised when an index is accessed with a conflicting consistency mode.

  Deprecated as of 1.7.7.
  """
class Posting(object):
  """Records the sorted positions at which a token occurs in one document."""

  def __init__(self, doc_id):
    """Initializer.

    Args:
      doc_id: The identifier of the document with token occurrences.

    Raises:
      TypeError: If an unknown argument is passed.
    """
    self._doc_id = doc_id
    # Token positions, kept sorted ascending and free of duplicates.
    self._positions = []

  @property
  def doc_id(self):
    """Return id of the document that the token occurred in."""
    return self._doc_id

  def AddPosition(self, position):
    """Adds the position in token sequence to occurrences for token."""
    insert_at = bisect.bisect_left(self._positions, position)
    already_present = (insert_at < len(self._positions) and
                       self._positions[insert_at] == position)
    if not already_present:
      self._positions.insert(insert_at, position)

  def RemovePosition(self, position):
    """Removes the position in token sequence from occurrences for token."""
    index = bisect.bisect_left(self._positions, position)
    if index < len(self._positions) and self._positions[index] == position:
      self._positions.pop(index)

  def __cmp__(self, other):
    # Python 2 comparison hook: postings order by document id so that
    # PostingList can bisect on doc_id.
    if not isinstance(other, Posting):
      return -2
    return cmp(self.doc_id, other.doc_id)

  @property
  def positions(self):
    return self._positions

  def __repr__(self):
    return search_util.Repr(
        self, [('doc_id', self.doc_id), ('positions', self.positions)])
class PostingList(object):
  """Represents ordered positions of some token in document.

  A PostingList consists of a document id and a sequence of positions
  that the same token occurs in the document.
  """

  def __init__(self):
    # Postings kept sorted by doc_id (Posting.__cmp__ orders on doc_id),
    # which lets Add/Remove locate an entry with bisect.
    self._postings = []

  def Add(self, doc_id, position):
    """Adds the token position for the given doc_id."""
    posting = Posting(doc_id=doc_id)
    pos = bisect.bisect_left(self._postings, posting)
    if pos < len(self._postings) and self._postings[
        pos].doc_id == posting.doc_id:
      # A posting for this doc_id already exists; extend it in place.
      posting = self._postings[pos]
    else:
      self._postings.insert(pos, posting)
    posting.AddPosition(position)

  def Remove(self, doc_id, position):
    """Removes the token position for the given doc_id."""
    posting = Posting(doc_id=doc_id)
    pos = bisect.bisect_left(self._postings, posting)
    if pos < len(self._postings) and self._postings[
        pos].doc_id == posting.doc_id:
      posting = self._postings[pos]
      posting.RemovePosition(position)
      if not posting.positions:
        # Drop the posting entirely once no positions remain for the doc.
        del self._postings[pos]

  @property
  def postings(self):
    return self._postings

  def __iter__(self):
    return iter(self._postings)

  def __repr__(self):
    return search_util.Repr(self, [('postings', self.postings)])
169 class _ScoredDocument(object):
170 """A scored document_pb.Document."""
172 def __init__(self, document, score):
173 self._document = document
174 self._score = score
175 self._expressions = {}
177 @property
178 def document(self):
179 return self._document
181 @property
182 def score(self):
183 return self._score
185 @property
186 def expressions(self):
187 return self._expressions
189 def __repr__(self):
190 return search_util.Repr(
191 self, [('document', self.document), ('score', self.score)])
194 class _DocumentStatistics(object):
195 """Statistics about terms occuring in a document."""
197 def __init__(self):
198 self._term_stats = {}
200 def __iter__(self):
201 for item in self._term_stats.items():
202 yield item
204 def IncrementTermCount(self, term):
205 """Adds an occurrence of the term to the stats for the document."""
206 count = 0
207 if term in self._term_stats:
208 count = self._term_stats[term]
209 count += 1
210 self._term_stats[term] = count
212 def TermFrequency(self, term):
213 """Returns the term frequency in the document."""
214 if term not in self._term_stats:
215 return 0
216 return self._term_stats[term]
218 @property
219 def term_stats(self):
220 """Returns the collection of term frequencies in the document."""
221 return self._term_stats
223 def __eq__(self, other):
224 return self.term_stats == other.term_stats
226 def __hash__(self):
227 return hash(self.term_stats)
229 def __repr__(self):
230 return search_util.Repr(self, [('term_stats', self.term_stats)])
class FieldTypesDict(object):
  """Dictionary-like object mapping field names to document_pb.FieldTypes."""

  def __init__(self):
    # One document_pb.FieldTypes entry per distinct field name.
    self._field_types = []

  def __contains__(self, name):
    return any(f.name() == name for f in self._field_types)

  def __getitem__(self, name):
    for field_types in self._field_types:
      if field_types.name() == name:
        return field_types
    raise KeyError(name)

  def AddFieldType(self, name, field_type):
    """Records that the named field was seen with the given value type."""
    entry = None
    for f in self._field_types:
      if f.name() == name:
        entry = f
    if entry is None:
      entry = document_pb.FieldTypes()
      entry.set_name(name)
      self._field_types.append(entry)
    if field_type not in entry.type_list():
      entry.add_type(field_type)

  def __iter__(self):
    return iter(sorted(f.name() for f in self._field_types))

  def __repr__(self):
    return repr(self._field_types)
class RamInvertedIndex(object):
  """A simple RAM-resident inverted file over documents."""

  def __init__(self, tokenizer):
    """Initializer.

    Args:
      tokenizer: Tokenizer used to split field values into tokens.
    """
    self._tokenizer = tokenizer
    # Maps a token to the PostingList of documents containing it.
    self._inverted_index = {}
    # Field name -> set of field types seen, built up as documents arrive.
    self._schema = FieldTypesDict()
    self._document_ids = set([])

  def _AddDocumentId(self, doc_id):
    """Adds the doc_id to set in index."""
    self._document_ids.add(doc_id)

  def _RemoveDocumentId(self, doc_id):
    """Removes the doc_id from the set in index."""
    if doc_id in self._document_ids:
      self._document_ids.remove(doc_id)

  @property
  def document_count(self):
    # Number of distinct documents currently indexed.
    return len(self._document_ids)

  def _AddFieldType(self, name, field_type):
    """Adds the type to the list supported for a named field."""
    self._schema.AddFieldType(name, field_type)

  def GetDocumentStats(self, document):
    """Gets statistics about occurrences of terms in document."""
    document_stats = _DocumentStatistics()
    for field in document.field_list():
      for token in self._tokenizer.TokenizeValue(field_value=field.value()):
        document_stats.IncrementTermCount(token.chars)
    return document_stats

  def AddDocument(self, doc_id, document):
    """Adds a document into the index."""
    token_position = 0
    for field in document.field_list():
      self._AddFieldType(field.name(), field.value().type())
      # NOTE(review): token_position is never advanced between fields, so
      # every field's tokens start at position 0 -- confirm this is intended.
      self._AddTokens(doc_id, field.name(), field.value(), token_position)
    self._AddDocumentId(doc_id)

  def RemoveDocument(self, document):
    """Removes a document from the index."""
    doc_id = document.id()
    for field in document.field_list():
      self._RemoveTokens(doc_id, field.name(), field.value())
    self._RemoveDocumentId(doc_id)

  def _AddTokens(self, doc_id, field_name, field_value, token_position):
    """Adds token occurrences for a given doc's field value."""
    for token in self._tokenizer.TokenizeValue(field_value, token_position):
      # Index each token both unrestricted and restricted to its field, so
      # queries with and without a field prefix can be answered.
      self._AddToken(doc_id, token)
      self._AddToken(doc_id, token.RestrictField(field_name))

  def _RemoveTokens(self, doc_id, field_name, field_value):
    """Removes tokens occurrences for a given doc's field value."""
    for token in self._tokenizer.TokenizeValue(field_value=field_value):
      self._RemoveToken(doc_id, token)
      self._RemoveToken(doc_id, token.RestrictField(field_name))

  def _AddToken(self, doc_id, token):
    """Adds a token occurrence for a document."""
    postings = self._inverted_index.get(token)
    if postings is None:
      self._inverted_index[token] = postings = PostingList()
    postings.Add(doc_id, token.position)

  def _RemoveToken(self, doc_id, token):
    """Removes a token occurrence for a document."""
    if token in self._inverted_index:
      postings = self._inverted_index[token]
      postings.Remove(doc_id, token.position)
      if not postings.postings:
        # Keep the index free of empty posting lists.
        del self._inverted_index[token]

  def GetPostingsForToken(self, token):
    """Returns all document postings for the token."""
    if token in self._inverted_index:
      return self._inverted_index[token].postings
    return []

  def GetSchema(self):
    """Returns the schema for the index."""
    return self._schema

  def __repr__(self):
    return search_util.Repr(self, [('_inverted_index', self._inverted_index),
                                   ('_schema', self._schema),
                                   ('document_count', self.document_count)])
358 def _ScoreRequested(params):
359 """Returns True if match scoring requested, False otherwise."""
360 return params.has_scorer_spec() and params.scorer_spec().has_scorer()
class SimpleIndex(object):
  """A simple search service which uses a RAM-resident inverted file."""

  def __init__(self, index_spec):
    """Initializer.

    Args:
      index_spec: The specification (name/namespace) of this index.
    """
    self._index_spec = index_spec
    # Maps doc_id -> document_pb.Document for every indexed document.
    self._documents = {}
    self._parser = simple_tokenizer.SimpleTokenizer(split_restricts=False)
    self._inverted_index = RamInvertedIndex(simple_tokenizer.SimpleTokenizer())

  @property
  def index_spec(self):
    """Returns the index specification for the index."""
    return self._index_spec

  def IndexDocuments(self, documents, response):
    """Indexes an iterable DocumentPb.Document."""
    for document in documents:
      doc_id = document.id()
      if not doc_id:
        # Assign a fresh id when the client did not provide one.
        doc_id = str(uuid.uuid4())
        document.set_id(doc_id)
      # Validate the document by round-tripping it through the public API;
      # invalid documents get an INVALID_REQUEST status and are skipped.
      try:
        search._NewDocumentFromPb(document)
      except ValueError, e:
        new_status = response.add_status()
        new_status.set_code(search_service_pb.SearchServiceError.INVALID_REQUEST)
        new_status.set_error_detail(e.message)
        continue
      response.add_doc_id(doc_id)
      if doc_id in self._documents:
        # Re-indexing an existing id replaces the old document entirely.
        old_document = self._documents[doc_id]
        self._inverted_index.RemoveDocument(old_document)
      self._documents[doc_id] = document
      new_status = response.add_status()
      new_status.set_code(search_service_pb.SearchServiceError.OK)
      self._inverted_index.AddDocument(doc_id, document)

  def DeleteDocuments(self, document_ids, response):
    """Deletes documents for the given document_ids."""
    for document_id in document_ids:
      if document_id in self._documents:
        document = self._documents[document_id]
        self._inverted_index.RemoveDocument(document)
        del self._documents[document_id]
      # An OK status is added even for unknown ids (delete is idempotent).
      delete_status = response.add_status()
      delete_status.set_code(search_service_pb.SearchServiceError.OK)

  def Documents(self):
    """Returns the documents in the index."""
    return self._documents.values()

  def _TermFrequency(self, term, document):
    """Return the term frequency in the document."""
    return self._inverted_index.GetDocumentStats(document).TermFrequency(term)

  @property
  def document_count(self):
    """Returns the count of documents in the index."""
    return self._inverted_index.document_count

  def _DocumentCountForTerm(self, term):
    """Returns the document count for documents containing the term."""
    return len(self._PostingsForToken(tokens.Token(chars=term)))

  def _InverseDocumentFrequency(self, term):
    """Returns inverse document frequency of term."""
    doc_count = self._DocumentCountForTerm(term)
    if doc_count:
      return math.log10(self.document_count / float(doc_count))
    else:
      return 0

  def _TermFrequencyInverseDocumentFrequency(self, term, document):
    """Returns the term frequency times inverse document frequency of term."""
    return (self._TermFrequency(term, document) *
            self._InverseDocumentFrequency(term))

  def _ScoreDocument(self, document, score, terms):
    """Scores a document for the given query.

    Returns 0 when scoring was not requested; otherwise the sum of
    TF-IDF contributions over all query terms.
    """
    if not score:
      return 0
    tf_idf = 0
    for term in terms:
      tf_idf += self._TermFrequencyInverseDocumentFrequency(term, document)
    return tf_idf

  def _PostingsForToken(self, token):
    """Returns the postings for the token."""
    return self._inverted_index.GetPostingsForToken(token)

  def _CollectTerms(self, node):
    """Get all search terms for scoring."""
    if node.getType() in search_util.TEXT_QUERY_TYPES:
      return set([query_parser.GetQueryNodeText(node).strip('"')])
    elif node.children:
      if node.getType() == QueryParser.EQ and len(node.children) > 1:
        # For field:value restrictions skip the field-name child; only the
        # value children contribute scoring terms.
        children = node.children[1:]
      else:
        children = node.children

      result = set()
      for term_set in (self._CollectTerms(child) for child in children):
        result.update(term_set)
      return result
    return set()

  def _CollectFields(self, node):
    """Returns the set of field names referenced by the query tree."""
    if node.getType() == QueryParser.EQ and node.children:
      return set([query_parser.GetQueryNodeText(node.children[0])])
    elif node.children:
      result = set()
      for term_set in (self._CollectFields(child) for child in node.children):
        result.update(term_set)
      return result
    return set()

  def _Evaluate(self, node, score=True):
    """Retrieve scored results for a search query."""
    doc_match = document_matcher.DocumentMatcher(node, self._inverted_index)

    matched_documents = doc_match.FilterDocuments(self._documents.itervalues())
    terms = self._CollectTerms(node)
    scored_documents = [
        _ScoredDocument(doc, self._ScoreDocument(doc, score, terms))
        for doc in matched_documents]
    return scored_documents

  def _Sort(self, docs, search_params, score):
    """Orders results by score, request sort specs, or recency (order_id)."""
    if score:
      return sorted(docs, key=lambda doc: doc.score, reverse=True)

    if not search_params.sort_spec_size():
      # Default ordering: most recently indexed documents first.
      return sorted(docs, key=lambda doc: doc.document.order_id(), reverse=True)

    def SortKey(scored_doc):
      """Return the sort key for a document based on the request parameters.

      Arguments:
        scored_doc: The document to score

      Returns:
        The sort key of a document. The sort key is a tuple, where the nth
        element in the tuple corresponds to the value of the nth sort expression
        evaluated on the document.

      Raises:
        Exception: if no default value is specified.
      """
      expr_vals = []
      for sort_spec in search_params.sort_spec_list():
        if not (sort_spec.has_default_value_text() or
                sort_spec.has_default_value_numeric()):
          raise Exception('A default value must be specified for sorting.')
        elif sort_spec.has_default_value_text():
          default_value = sort_spec.default_value_text()
        else:
          default_value = sort_spec.default_value_numeric()
        val = expression_evaluator.ExpressionEvaluator(
            scored_doc, self._inverted_index, True).ValueOf(
                sort_spec.sort_expression(), default_value=default_value)
        if isinstance(val, datetime.datetime):
          # Dates sort by their epoch time, matching production behavior.
          val = search_util.EpochTime(val)
        expr_vals.append(val)
      return tuple(expr_vals)

    def SortCmp(x, y):
      """The comparison function for sort keys."""

      # Compare element-wise; the per-spec sort_descending flag flips the
      # comparison for that position only (Python 2 cmp-style sort).
      for i, val_tuple in enumerate(zip(x, y)):
        cmp_val = cmp(*val_tuple)
        if cmp_val:
          if search_params.sort_spec(i).sort_descending():
            return -cmp_val
          return cmp_val
      return 0
    return sorted(docs, key=SortKey, cmp=SortCmp)

  def _AttachExpressions(self, docs, search_params):
    """Evaluates requested field expressions onto each scored document."""
    if search_params.has_field_spec():
      for doc in docs:
        evaluator = expression_evaluator.ExpressionEvaluator(
            doc, self._inverted_index)
        for expr in search_params.field_spec().expression_list():
          evaluator.Evaluate(expr)
    return docs

  def Search(self, search_request):
    """Searches the simple index for documents matching the request."""
    query = urllib.unquote(search_request.query())
    query = query.strip()
    score = _ScoreRequested(search_request)
    if not query:
      # An empty query matches every document with a zero score.
      docs = [_ScoredDocument(doc, 0.0) for doc in self._documents.values()]
    else:
      if not isinstance(query, unicode):
        query = unicode(query, 'utf-8')
      query_tree = query_parser.ParseAndSimplify(query)
      docs = self._Evaluate(query_tree, score=score)
    docs = self._Sort(docs, search_request, score)
    docs = self._AttachExpressions(docs, search_request)
    return docs

  def GetSchema(self):
    """Returns the schema for the index."""
    return self._inverted_index.GetSchema()

  def __repr__(self):
    return search_util.Repr(self, [('_index_spec', self._index_spec),
                                   ('_documents', self._documents),
                                   ('_inverted_index', self._inverted_index)])
class SearchServiceStub(apiproxy_stub.APIProxyStub):
  """Simple RAM backed Search service stub.

  This stub provides the search_service_pb.SearchService. But this is
  NOT a subclass of SearchService itself. Services are provided by
  the methods prefixed by "_Dynamic_".
  """

  # Version tag written into the pickled index file; persisted indexes from a
  # different version are discarded on Read().
  _VERSION = 1

  def __init__(self, service_name='search', index_file=None):
    """Constructor.

    Args:
      service_name: Service name expected for all calls.
      index_file: The file to which search indexes will be persisted.
    """
    # Maps namespace -> {index name -> SimpleIndex}.
    self.__indexes = {}
    self.__index_file = index_file
    self.__index_file_lock = threading.Lock()
    super(SearchServiceStub, self).__init__(service_name)

    self.Read()

  def _InvalidRequest(self, status, exception):
    # Fills status with INVALID_REQUEST and the exception's message.
    status.set_code(search_service_pb.SearchServiceError.INVALID_REQUEST)
    status.set_error_detail(exception.message)

  def _UnknownIndex(self, status, index_spec):
    # An unknown index is reported with an OK code plus a detail string, so
    # callers treat a missing index as an empty (not failing) result.
    status.set_code(search_service_pb.SearchServiceError.OK)
    status.set_error_detail('no index for %r' % index_spec)

  def _GetNamespace(self, namespace):
    """Get namespace name.

    Args:
      namespace: Namespace provided in request arguments.

    Returns:
      If namespace is None, returns the name of the current global namespace. If
      namespace is not None, returns namespace.
    """
    if namespace is not None:
      return namespace
    return namespace_manager.get_namespace()

  def _GetIndex(self, index_spec, create=False):
    """Returns the index for index_spec, creating it when create is True."""
    namespace = self._GetNamespace(index_spec.namespace())

    index = self.__indexes.setdefault(namespace, {}).get(index_spec.name())
    if index is None:
      if create:
        index = SimpleIndex(index_spec)
        self.__indexes[namespace][index_spec.name()] = index
      else:
        return None
    return index

  def _Dynamic_IndexDocument(self, request, response):
    """A local implementation of SearchService.IndexDocument RPC.

    Index a new document or update an existing document.

    Args:
      request: A search_service_pb.IndexDocumentRequest.
      response: An search_service_pb.IndexDocumentResponse.
    """
    params = request.params()
    index = self._GetIndex(params.index_spec(), create=True)
    index.IndexDocuments(params.document_list(), response)

  def _Dynamic_DeleteDocument(self, request, response):
    """A local implementation of SearchService.DeleteDocument RPC.

    Args:
      request: A search_service_pb.DeleteDocumentRequest.
      response: An search_service_pb.DeleteDocumentResponse.
    """
    params = request.params()
    index_spec = params.index_spec()
    index = self._GetIndex(index_spec)
    if index is None:
      self._UnknownIndex(response.add_status(), index_spec)
      return
    index.DeleteDocuments(params.doc_id_list(), response)

  def _Dynamic_ListIndexes(self, request, response):
    """A local implementation of SearchService.ListIndexes RPC.

    Args:
      request: A search_service_pb.ListIndexesRequest.
      response: An search_service_pb.ListIndexesResponse.

    Raises:
      ResponseTooLargeError: raised for testing admin console.
    """

    # Requests carrying an app_id come from the admin console test harness:
    # respond with random (sometimes failing) data instead of real indexes.
    if request.has_app_id():
      if random.choice([True] + [False] * 9):
        raise apiproxy_errors.ResponseTooLargeError()

      for _ in xrange(random.randint(0, 2) * random.randint(5, 15)):
        new_index_spec = response.add_index_metadata().mutable_index_spec()
        new_index_spec.set_name(
            random.choice(list(_VISIBLE_PRINTABLE_ASCII - set('!'))) +
            ''.join(random.choice(list(_VISIBLE_PRINTABLE_ASCII))
                    for _ in xrange(random.randint(
                        0, search.MAXIMUM_INDEX_NAME_LENGTH))))
      response.mutable_status().set_code(
          random.choice([search_service_pb.SearchServiceError.OK] * 10 +
                        [search_service_pb.SearchServiceError.TRANSIENT_ERROR] +
                        [search_service_pb.SearchServiceError.INTERNAL_ERROR]))
      return

    response.mutable_status().set_code(
        search_service_pb.SearchServiceError.OK)

    namespace = self._GetNamespace(request.params().namespace())
    if namespace not in self.__indexes or not self.__indexes[namespace]:
      return

    # Walk the indexes in name order and page through them according to the
    # request's start name / prefix / offset / limit parameters.
    keys, indexes = zip(*sorted(
        self.__indexes[namespace].iteritems(), key=lambda v: v[0]))
    position = 0
    params = request.params()
    if params.has_start_index_name():
      position = bisect.bisect_left(keys, params.start_index_name())
      if (not params.include_start_index() and position < len(keys)
          and keys[position] == params.start_index_name()):
        position += 1
    elif params.has_index_name_prefix():
      position = bisect.bisect_left(keys, params.index_name_prefix())
    if params.has_offset():
      position += params.offset()
    end_position = position + params.limit()
    prefix = params.index_name_prefix()
    for index in indexes[min(position, len(keys)):min(end_position, len(keys))]:
      index_spec = index.index_spec
      if prefix and not index_spec.name().startswith(prefix):
        # Names are sorted, so the first non-matching name ends the prefix run.
        break
      metadata = response.add_index_metadata()
      new_index_spec = metadata.mutable_index_spec()
      new_index_spec.set_name(index_spec.name())
      new_index_spec.set_namespace(index_spec.namespace())
      if params.fetch_schema():
        self._AddSchemaInformation(index, metadata)

  def _AddSchemaInformation(self, index, metadata_pb):
    """Copies the index's field-type schema into the metadata proto."""
    schema = index.GetSchema()
    for name in schema:
      field_types = schema[name]
      new_field_types = metadata_pb.add_field()
      new_field_types.MergeFrom(field_types)

  def _AddDocument(self, response, document, ids_only):
    """Appends document (or just its id when ids_only) to the response."""
    doc = response.add_document()
    if ids_only:
      doc.set_id(document.id())
    else:
      doc.MergeFrom(document)

  def _Dynamic_ListDocuments(self, request, response):
    """A local implementation of SearchService.ListDocuments RPC.

    Args:
      request: A search_service_pb.ListDocumentsRequest.
      response: An search_service_pb.ListDocumentsResponse.
    """
    params = request.params()
    index = self._GetIndex(params.index_spec(), create=True)
    # NOTE(review): with create=True _GetIndex never returns None, so this
    # branch appears unreachable; kept as-is.
    if index is None:
      self._UnknownIndex(response.mutable_status(), params.index_spec())
      return

    num_docs = 0
    # start is False until the requested start_doc_id has been reached.
    start = not params.has_start_doc_id()
    for document in sorted(index.Documents(), key=lambda doc: doc.id()):
      if start:
        if num_docs < params.limit():
          self._AddDocument(response, document, params.keys_only())
          num_docs += 1
      else:
        if document.id() >= params.start_doc_id():
          start = True
          # The start document itself is included only when requested.
          if (document.id() != params.start_doc_id() or
              params.include_start_doc()):
            self._AddDocument(response, document, params.keys_only())
            num_docs += 1

    response.mutable_status().set_code(
        search_service_pb.SearchServiceError.OK)

  def _RandomSearchResponse(self, request, response):
    """Fills response with random results for admin console testing."""

    random.seed()
    if random.random() < 0.03:
      raise apiproxy_errors.ResponseTooLargeError()
    response.mutable_status().set_code(
        random.choice([search_service_pb.SearchServiceError.OK] * 30 +
                      [search_service_pb.SearchServiceError.TRANSIENT_ERROR] +
                      [search_service_pb.SearchServiceError.INTERNAL_ERROR]))

    # Re-seed on the query so repeated identical queries return the same
    # pseudo-random result set.
    params = request.params()
    random.seed(params.query())
    total = random.randint(0, 100)

    if random.random() < 0.3:
      total = 0

    offset = 0
    if params.has_offset():
      offset = params.offset()

    remaining = max(0, total - offset)
    nresults = min(remaining, params.limit())
    matched_count = offset + nresults
    if remaining > nresults:
      matched_count += random.randint(1, 100)

    def RandomText(charset, min_len, max_len):
      # Random string over charset with length in [min_len, max_len].
      return ''.join(random.choice(charset)
                     for _ in xrange(random.randint(min_len, max_len)))

    for i in xrange(nresults):
      # Seed per result so each (query, position) pair is deterministic.
      seed = '%s:%s' % (params.query(), i + offset)
      random.seed(seed)
      result = response.add_result()
      doc = result.mutable_document()
      doc_id = RandomText(string.letters + string.digits, 8, 10)
      doc.set_id(doc_id)
      random.seed(doc_id)
      for _ in params.sort_spec_list():
        result.add_score(random.random())

      for name, probability in [('creator', 0.90), ('last_change', 0.40)]:
        if random.random() < probability:
          field = doc.add_field()
          field.set_name(name)
          value = field.mutable_value()
          value.set_type(document_pb.FieldValue.TEXT)
          value.set_string_value(
              RandomText(string.letters + string.digits, 2, 10)
              + '@google.com')

      # A 'content' field that always embeds the query text.
      field = doc.add_field()
      field.set_name('content')
      value = field.mutable_value()
      value.set_type(document_pb.FieldValue.TEXT)
      value.set_string_value(
          RandomText(string.printable, 0, 15) + params.query() +
          RandomText(string.printable + 10 * string.whitespace, 5, 5000))

      # Zero to two extra random fields.
      for i in xrange(random.randint(0, 2)):
        field = doc.add_field()
        field.set_name(RandomText(string.letters, 3, 7))
        value = field.mutable_value()
        value.set_type(document_pb.FieldValue.TEXT)
        value.set_string_value(RandomText(string.printable, 0, 100))

    response.set_matched_count(matched_count)

  def _DefaultFillSearchResponse(self, params, results, response):
    """Fills the SearchResponse with the first set of results."""
    position_range = range(0, min(params.limit(), len(results)))
    self._FillSearchResponse(results, position_range, params.cursor_type(),
                             _ScoreRequested(params), response)

  def _CopyDocument(self, doc, doc_copy, field_names, ids_only=None):
    """Copies Document, doc, to doc_copy restricting fields to field_names."""
    doc_copy.set_id(doc.id())
    if ids_only:
      return
    if doc.has_language():
      doc_copy.set_language(doc.language())
    for field in doc.field_list():
      # An empty field_names means "copy all fields".
      if not field_names or field.name() in field_names:
        doc_copy.add_field().CopyFrom(field)
    doc_copy.set_order_id(doc.order_id())

  def _FillSearchResponse(self, results, position_range, cursor_type, score,
                          response, field_names=None, ids_only=None):
    """Fills the SearchResponse with a selection of results."""
    for i in position_range:
      result = results[i]
      search_result = response.add_result()
      self._CopyDocument(result.document, search_result.mutable_document(),
                         field_names, ids_only)
      if cursor_type == search_service_pb.SearchParams.PER_RESULT:
        search_result.set_cursor(self._EncodeCursor(result.document))
      if score:
        search_result.add_score(result.score)
      for field, expression in result.expressions.iteritems():
        expr = search_result.add_expression()
        expr.set_name(field)
        # Numeric expression values are serialized as strings but typed
        # NUMBER; everything else is returned as HTML.
        if (isinstance(expression, float) or
            isinstance(expression, long) or
            isinstance(expression, int)):
          expr.mutable_value().set_string_value(str(expression))
          expr.mutable_value().set_type(document_pb.FieldValue.NUMBER)
        else:
          expr.mutable_value().set_string_value(expression)
          expr.mutable_value().set_type(document_pb.FieldValue.HTML)

  def _Dynamic_Search(self, request, response):
    """A local implementation of SearchService.Search RPC.

    Args:
      request: A search_service_pb.SearchRequest.
      response: An search_service_pb.SearchResponse.
    """
    if request.has_app_id():
      # Admin-console test request: serve random data.
      self._RandomSearchResponse(request, response)
      return

    index = None
    index = self._GetIndex(request.params().index_spec())
    if index is None:
      self._UnknownIndex(response.mutable_status(),
                         request.params().index_spec())
      response.set_matched_count(0)
      return

    params = request.params()
    try:
      results = index.Search(params)
    except query_parser.QueryException, e:
      self._InvalidRequest(response.mutable_status(), e)
      response.set_matched_count(0)
      return
    except document_matcher.ExpressionTreeException, e:
      self._InvalidRequest(response.mutable_status(), e)
      response.set_matched_count(0)
      return
    response.set_matched_count(len(results))

    # A cursor takes precedence over an explicit offset: resume just after
    # the document the cursor names.
    offset = 0
    if params.has_cursor():
      doc_id = self._DecodeCursor(params.cursor())
      for i, result in enumerate(results):
        if result.document.id() == doc_id:
          offset = i + 1
          break
    elif params.has_offset():
      offset = params.offset()

    if offset < len(results):

      limit = offset + params.limit()
      if limit >= len(results):

        # Fewer results than the limit: no continuation cursor is needed.
        range_end = len(results)
      else:

        # More results remain, so emit a SINGLE cursor pointing at the last
        # returned document when one was requested.
        range_end = limit
        if params.cursor_type() == search_service_pb.SearchParams.SINGLE:
          document = results[range_end - 1].document
          response.set_cursor(self._EncodeCursor(document))
      result_range = range(offset, range_end)
    else:
      result_range = range(0)
    field_names = params.field_spec().name_list()
    self._FillSearchResponse(results, result_range, params.cursor_type(),
                             _ScoreRequested(params), response, field_names,
                             params.keys_only())

    response.mutable_status().set_code(search_service_pb.SearchServiceError.OK)

  def _EncodeCursor(self, document):
    # Cursors are simply the URL-safe base64 of the document id.
    return base64.urlsafe_b64encode(document.id())

  def _DecodeCursor(self, cursor):
    return base64.urlsafe_b64decode(cursor)

  def __repr__(self):
    return search_util.Repr(self, [('__indexes', self.__indexes)])

  def Write(self):
    """Write search indexes to the index file.

    This method is a no-op if index_file is set to None.
    """
    if not self.__index_file:
      return

    # Pickle into a temp file in the same directory, then rename over the
    # real file so readers never observe a partially written index.
    descriptor, tmp_filename = tempfile.mkstemp(
        dir=os.path.dirname(self.__index_file))
    tmpfile = os.fdopen(descriptor, 'wb')

    pickler = pickle.Pickler(tmpfile, protocol=1)
    pickler.fast = True
    pickler.dump((self._VERSION, self.__indexes))

    tmpfile.close()

    self.__index_file_lock.acquire()
    try:
      try:

        os.rename(tmp_filename, self.__index_file)
      except OSError:

        # On Windows rename fails if the target exists; remove it and retry.
        os.remove(self.__index_file)
        os.rename(tmp_filename, self.__index_file)
    finally:
      self.__index_file_lock.release()

  def _ReadFromFile(self):
    """Unpickles indexes from the index file; returns {} on any failure."""
    self.__index_file_lock.acquire()
    try:
      if os.path.isfile(self.__index_file):
        version, indexes = pickle.load(open(self.__index_file, 'rb'))
        if version == self._VERSION:
          return indexes
        logging.warning(
            'Saved search indexes are not compatible with this version of the '
            'SDK. Search indexes have been cleared.')
      else:
        logging.warning(
            'Could not read search indexes from %s', self.__index_file)
    except (AttributeError, LookupError, ImportError, NameError, TypeError,
            ValueError, pickle.PickleError, IOError), e:
      # Corrupt or unreadable index files are logged and ignored rather than
      # crashing the dev server.
      logging.warning(
          'Could not read indexes from %s. Try running with the '
          '--clear_search_index flag. Cause:\n%r' % (self.__index_file, e))
    finally:
      self.__index_file_lock.release()

    return {}

  def Read(self):
    """Read search indexes from the index file.

    This method is a no-op if index_file is set to None.
    """
    if not self.__index_file:
      return
    read_indexes = self._ReadFromFile()
    if read_indexes:
      self.__indexes = read_indexes