App Engine Python SDK version 1.9.12
python/google/appengine/api/search/simple_search_stub.py

#!/usr/bin/env python
#
# Copyright 2007 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

21 """Simple RAM backed Search API stub."""
32 import base64
33 import bisect
34 import copy
35 import cPickle as pickle
36 import datetime
37 import functools
38 import hashlib
39 import logging
40 import math
41 import os
42 import random
43 import string
44 import tempfile
45 import threading
46 import urllib
47 import uuid
49 from google.appengine.datastore import document_pb
50 from google.appengine.api import apiproxy_stub
51 from google.appengine.api.namespace_manager import namespace_manager
52 from google.appengine.api.search import query_parser
53 from google.appengine.api.search import QueryParser
54 from google.appengine.api.search import search
55 from google.appengine.api.search import search_service_pb
56 from google.appengine.api.search import search_util
57 from google.appengine.api.search.stub import document_matcher
58 from google.appengine.api.search.stub import expression_evaluator
59 from google.appengine.api.search.stub import simple_tokenizer
60 from google.appengine.api.search.stub import tokens
61 from google.appengine.runtime import apiproxy_errors

__all__ = ['IndexConsistencyError',
           'Posting',
           'PostingList',
           'RamInvertedIndex',
           'SearchServiceStub',
           'SimpleIndex',
           'FieldTypesDict',
          ]

_VISIBLE_PRINTABLE_ASCII = frozenset(
    set(string.printable) - set(string.whitespace))

_FAILED_TO_PARSE_SEARCH_REQUEST = 'Failed to parse search request \"%s\"; %s'

class _InvalidCursorException(Exception):
  """Raised when parsing a cursor fails."""


class IndexConsistencyError(Exception):
  """Deprecated 1.7.7. Accessed index with same name different consistency."""

class Posting(object):
  """Represents occurrences of some token at positions in a document."""

  def __init__(self, doc_id):
    """Initializer.

    Args:
      doc_id: The identifier of the document with token occurrences.

    Raises:
      TypeError: If an unknown argument is passed.
    """
    self._doc_id = doc_id
    self._positions = []

  @property
  def doc_id(self):
    """Return id of the document that the token occurred in."""
    return self._doc_id

  def AddPosition(self, position):
    """Adds the position in token sequence to occurrences for token."""
    pos = bisect.bisect_left(self._positions, position)
    if pos < len(self._positions) and self._positions[pos] == position:
      return
    self._positions.insert(pos, position)

  def RemovePosition(self, position):
    """Removes the position in token sequence from occurrences for token."""
    pos = bisect.bisect_left(self._positions, position)
    if pos < len(self._positions) and self._positions[pos] == position:
      del self._positions[pos]

  def __cmp__(self, other):
    if not isinstance(other, Posting):
      return -2
    return cmp(self.doc_id, other.doc_id)

  @property
  def positions(self):
    return self._positions

  def __repr__(self):
    return search_util.Repr(
        self, [('doc_id', self.doc_id), ('positions', self.positions)])

class PostingList(object):
  """Represents the occurrences of some token across documents.

  A PostingList holds one Posting per document containing the token; each
  Posting records the ordered positions at which the token occurs in that
  document.
  """

  def __init__(self):
    self._postings = []

  def Add(self, doc_id, position):
    """Adds the token position for the given doc_id."""
    posting = Posting(doc_id=doc_id)
    pos = bisect.bisect_left(self._postings, posting)
    if pos < len(self._postings) and self._postings[
        pos].doc_id == posting.doc_id:
      posting = self._postings[pos]
    else:
      self._postings.insert(pos, posting)
    posting.AddPosition(position)

  def Remove(self, doc_id, position):
    """Removes the token position for the given doc_id."""
    posting = Posting(doc_id=doc_id)
    pos = bisect.bisect_left(self._postings, posting)
    if pos < len(self._postings) and self._postings[
        pos].doc_id == posting.doc_id:
      posting = self._postings[pos]
      posting.RemovePosition(position)
      if not posting.positions:
        del self._postings[pos]

  @property
  def postings(self):
    return self._postings

  def __iter__(self):
    return iter(self._postings)

  def __repr__(self):
    return search_util.Repr(self, [('postings', self.postings)])
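
# A minimal usage sketch of the posting structures above (the doc ids and
# positions here are hypothetical, for illustration only):
#
#   plist = PostingList()
#   plist.Add('doc1', 3)
#   plist.Add('doc1', 1)
#   plist.Add('doc2', 7)
#   # plist.postings stays sorted by doc_id, and each Posting keeps its
#   # positions sorted: roughly [Posting('doc1', [1, 3]), Posting('doc2', [7])]
#   plist.Remove('doc1', 1)
#   plist.Remove('doc1', 3)  # 'doc1' has no positions left and is dropped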

class _ScoredDocument(object):
  """A scored document_pb.Document."""

  def __init__(self, document, score):
    self._document = document
    self._score = score
    self._expressions = {}

  @property
  def document(self):
    return self._document

  @property
  def score(self):
    return self._score

  @property
  def expressions(self):
    return self._expressions

  def __repr__(self):
    return search_util.Repr(
        self, [('document', self.document), ('score', self.score)])


class _DocumentStatistics(object):
  """Statistics about terms occurring in a document."""

  def __init__(self):
    self._term_stats = {}

  def __iter__(self):
    for item in self._term_stats.items():
      yield item

  def IncrementTermCount(self, term):
    """Adds an occurrence of the term to the stats for the document."""
    count = 0
    if term in self._term_stats:
      count = self._term_stats[term]
    count += 1
    self._term_stats[term] = count

  def TermFrequency(self, term):
    """Returns the term frequency in the document."""
    if term not in self._term_stats:
      return 0
    return self._term_stats[term]

  @property
  def term_stats(self):
    """Returns the collection of term frequencies in the document."""
    return self._term_stats

  def __eq__(self, other):
    return self.term_stats == other.term_stats

  def __hash__(self):
    # A dict is not hashable; hash a frozenset of its items instead.
    return hash(frozenset(self._term_stats.items()))

  def __repr__(self):
    return search_util.Repr(self, [('term_stats', self.term_stats)])
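
# Accounting sketch for _DocumentStatistics (hypothetical terms): after
# IncrementTermCount('cat') twice and IncrementTermCount('dog') once,
# TermFrequency('cat') == 2, TermFrequency('dog') == 1, and any unseen term
# reports 0.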

class FieldTypesDict(object):
  """Dictionary-like object for type mappings."""

  def __init__(self):
    self._field_types = []

  def __contains__(self, name):
    return name in [f.name() for f in self._field_types]

  def __getitem__(self, name):
    for f in self._field_types:
      if name == f.name():
        return f
    raise KeyError(name)

  def IsType(self, name, field_type):
    if name not in self:
      return False
    schema_type = self[name]
    return field_type in schema_type.type_list()

  def AddFieldType(self, name, field_type):
    field_types = None
    for f in self._field_types:
      if name == f.name():
        field_types = f
    if field_types is None:
      field_types = document_pb.FieldTypes()
      field_types.set_name(name)
      self._field_types.append(field_types)
    if field_type not in field_types.type_list():
      field_types.add_type(field_type)

  def __iter__(self):
    return iter(sorted([f.name() for f in self._field_types]))

  def __repr__(self):
    return repr(self._field_types)

class RamInvertedIndex(object):
  """A simple RAM-resident inverted file over documents."""

  def __init__(self, tokenizer):
    self._tokenizer = tokenizer
    self._inverted_index = {}
    self._schema = FieldTypesDict()
    self._document_ids = set([])

  def _AddDocumentId(self, doc_id):
    """Adds the doc_id to the set in the index."""
    self._document_ids.add(doc_id)

  def _RemoveDocumentId(self, doc_id):
    """Removes the doc_id from the set in the index."""
    if doc_id in self._document_ids:
      self._document_ids.remove(doc_id)

  @property
  def document_count(self):
    return len(self._document_ids)

  def _AddFieldType(self, name, field_type):
    """Adds the type to the list supported for a named field."""
    self._schema.AddFieldType(name, field_type)

  def GetDocumentStats(self, document):
    """Gets statistics about occurrences of terms in document."""
    document_stats = _DocumentStatistics()
    for field in document.field_list():
      for token in self._tokenizer.TokenizeValue(field_value=field.value()):
        document_stats.IncrementTermCount(token.chars)
    return document_stats

  def AddDocument(self, doc_id, document):
    """Adds a document into the index."""
    token_position = 0
    for field in document.field_list():
      self._AddFieldType(field.name(), field.value().type())
      self._AddTokens(doc_id, field.name(), field.value(), token_position)
    self._AddDocumentId(doc_id)

  def RemoveDocument(self, document):
    """Removes a document from the index."""
    doc_id = document.id()
    for field in document.field_list():
      self._RemoveTokens(doc_id, field.name(), field.value())
    self._RemoveDocumentId(doc_id)

  def _AddTokens(self, doc_id, field_name, field_value, token_position):
    """Adds token occurrences for a given doc's field value."""
    for token in self._tokenizer.TokenizeValue(field_value, token_position):
      self._AddToken(doc_id, token)
      self._AddToken(doc_id, token.RestrictField(field_name))

  def _RemoveTokens(self, doc_id, field_name, field_value):
    """Removes token occurrences for a given doc's field value."""
    for token in self._tokenizer.TokenizeValue(field_value=field_value):
      self._RemoveToken(doc_id, token)
      self._RemoveToken(doc_id, token.RestrictField(field_name))

  def _AddToken(self, doc_id, token):
    """Adds a token occurrence for a document."""
    postings = self._inverted_index.get(token)
    if postings is None:
      self._inverted_index[token] = postings = PostingList()
    postings.Add(doc_id, token.position)

  def _RemoveToken(self, doc_id, token):
    """Removes a token occurrence for a document."""
    if token in self._inverted_index:
      postings = self._inverted_index[token]
      postings.Remove(doc_id, token.position)
      if not postings.postings:
        del self._inverted_index[token]

  def GetPostingsForToken(self, token):
    """Returns all document postings for the token."""
    if token in self._inverted_index:
      return self._inverted_index[token].postings
    return []

  def GetSchema(self):
    """Returns the schema for the index."""
    return self._schema

  def __repr__(self):
    return search_util.Repr(self, [('_inverted_index', self._inverted_index),
                                   ('_schema', self._schema),
                                   ('document_count', self.document_count)])
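
# Indexing note: _AddTokens stores every token twice, once bare and once
# field-restricted via tokens.Token.RestrictField, so global queries ('cat')
# and field queries ('body:cat') both resolve through the same posting-list
# lookup in GetPostingsForToken.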

def _ScoreRequested(params):
  """Returns True if match scoring requested, False otherwise."""
  return params.has_scorer_spec() and params.scorer_spec().has_scorer()

class SimpleIndex(object):
  """A simple search service which uses a RAM-resident inverted file."""

  def __init__(self, index_spec):
    self._index_spec = index_spec
    self._documents = {}
    self._parser = simple_tokenizer.SimpleTokenizer(split_restricts=False)
    self._inverted_index = RamInvertedIndex(simple_tokenizer.SimpleTokenizer())

  @property
  def index_spec(self):
    """Returns the index specification for the index."""
    return self._index_spec

  def IndexDocuments(self, documents, response):
    """Indexes an iterable of document_pb.Document."""
    for document in documents:
      doc_id = document.id()
      if not doc_id:
        doc_id = str(uuid.uuid4())
        document.set_id(doc_id)
      try:
        search._NewDocumentFromPb(document)
      except ValueError, e:
        new_status = response.add_status()
        new_status.set_code(
            search_service_pb.SearchServiceError.INVALID_REQUEST)
        new_status.set_error_detail(e.message)
        continue
      response.add_doc_id(doc_id)
      if doc_id in self._documents:
        old_document = self._documents[doc_id]
        self._inverted_index.RemoveDocument(old_document)
      self._documents[doc_id] = document
      new_status = response.add_status()
      new_status.set_code(search_service_pb.SearchServiceError.OK)
      self._inverted_index.AddDocument(doc_id, document)

  def DeleteDocuments(self, document_ids, response):
    """Deletes documents for the given document_ids."""
    for document_id in document_ids:
      self.DeleteDocument(document_id, response.add_status())

  def DeleteDocument(self, document_id, delete_status):
    """Deletes the document, if any, with the given document_id."""
    if document_id in self._documents:
      document = self._documents[document_id]
      self._inverted_index.RemoveDocument(document)
      del self._documents[document_id]
      delete_status.set_code(search_service_pb.SearchServiceError.OK)
    else:
      delete_status.set_code(search_service_pb.SearchServiceError.OK)
      delete_status.set_error_detail('Not found')

  def Documents(self):
    """Returns the documents in the index."""
    return self._documents.values()

  def _TermFrequency(self, term, document):
    """Returns the term frequency in the document."""
    return self._inverted_index.GetDocumentStats(document).TermFrequency(term)

  @property
  def document_count(self):
    """Returns the count of documents in the index."""
    return self._inverted_index.document_count

  def _DocumentCountForTerm(self, term):
    """Returns the document count for documents containing the term."""
    return len(self._PostingsForToken(tokens.Token(chars=term)))

  def _InverseDocumentFrequency(self, term):
    """Returns the inverse document frequency of the term."""
    doc_count = self._DocumentCountForTerm(term)
    if doc_count:
      return math.log10(self.document_count / float(doc_count))
    else:
      return 0

  def _TermFrequencyInverseDocumentFrequency(self, term, document):
    """Returns the term frequency times inverse document frequency of term."""
    return (self._TermFrequency(term, document) *
            self._InverseDocumentFrequency(term))

  def _ScoreDocument(self, document, score, terms):
    """Scores a document for the given query."""
    if not score:
      return 0
    tf_idf = 0
    for term in terms:
      tf_idf += self._TermFrequencyInverseDocumentFrequency(term, document)
    return tf_idf
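
  # Scoring note: _ScoreDocument above computes a plain TF*IDF sum,
  #   score(doc, terms) = sum over t in terms of tf(t, doc) * idf(t)
  # where idf(t) = log10(N / df(t)), N is document_count, and df(t) is the
  # number of documents containing t (idf is taken as 0 when df(t) == 0).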

  def _PostingsForToken(self, token):
    """Returns the postings for the token."""
    return self._inverted_index.GetPostingsForToken(token)

  def _CollectTerms(self, node):
    """Get all search terms for scoring."""
    if node.getType() in search_util.TEXT_QUERY_TYPES:
      return set([query_parser.GetQueryNodeText(node).strip('"')])
    elif node.children:
      if node.getType() == QueryParser.EQ and len(node.children) > 1:
        children = node.children[1:]
      else:
        children = node.children

      result = set()
      for term_set in (self._CollectTerms(child) for child in children):
        result.update(term_set)
      return result
    return set()

  def _CollectFields(self, node):
    if node.getType() == QueryParser.EQ and node.children:
      return set([query_parser.GetQueryNodeText(node.children[0])])
    elif node.children:
      result = set()
      for term_set in (self._CollectFields(child) for child in node.children):
        result.update(term_set)
      return result
    return set()
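
  # Note on the EQ handling in the two collectors above: for a
  # field-restricted query such as field:value, the parser emits an EQ node
  # whose first child is the field name, so _CollectTerms skips children[0]
  # while _CollectFields reads it.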

  def _Evaluate(self, node, score=True):
    """Retrieve scored results for a search query."""
    doc_match = document_matcher.DocumentMatcher(node, self._inverted_index)

    matched_documents = doc_match.FilterDocuments(self._documents.itervalues())
    terms = self._CollectTerms(node)
    scored_documents = [
        _ScoredDocument(doc, self._ScoreDocument(doc, score, terms))
        for doc in matched_documents]
    return scored_documents

  def _Sort(self, docs, search_params, query, score):
    """Returns sorted docs with score or evaluated search_params as sort key."""
    docs = sorted(docs, key=lambda doc: doc.document.order_id(), reverse=True)

    if not search_params.sort_spec_size():
      if score:
        return sorted(docs, key=lambda doc: doc.score, reverse=True)
      return docs

    def SortKey(scored_doc):
      """Returns the sort key for a document based on the request parameters.

      Args:
        scored_doc: The document to score.

      Returns:
        The sort key of a document. The sort key is a tuple, where the nth
        element in the tuple corresponds to the value of the nth sort
        expression evaluated on the document.

      Raises:
        Exception: If no default value is specified.
      """
      expr_vals = []
      for sort_spec in search_params.sort_spec_list():
        default_text = None
        default_numeric = None
        if sort_spec.has_default_value_text():
          default_text = sort_spec.default_value_text()
        if sort_spec.has_default_value_numeric():
          default_numeric = sort_spec.default_value_numeric()
        try:
          text_val = expression_evaluator.ExpressionEvaluator(
              scored_doc, self._inverted_index, True).ValueOf(
                  sort_spec.sort_expression(), default_value=default_text,
                  return_type=search_util.EXPRESSION_RETURN_TYPE_TEXT)
          num_val = expression_evaluator.ExpressionEvaluator(
              scored_doc, self._inverted_index, True).ValueOf(
                  sort_spec.sort_expression(), default_value=default_numeric,
                  return_type=search_util.EXPRESSION_RETURN_TYPE_NUMERIC)
        except expression_evaluator.QueryExpressionEvaluationError, e:
          raise expression_evaluator.ExpressionEvaluationError(
              _FAILED_TO_PARSE_SEARCH_REQUEST % (query, e))
        if isinstance(num_val, datetime.datetime):
          num_val = search_util.EpochTime(num_val)
        elif isinstance(text_val, datetime.datetime):
          num_val = search_util.EpochTime(text_val)

        if text_val is None:
          text_val = ''
        if num_val is None:
          num_val = 0
        expr_vals.append([text_val, num_val])
      return tuple(expr_vals)

    def SortCmp(x, y):
      """The comparison function for sort keys."""
      for i, val_tuple in enumerate(zip(x, y)):
        cmp_val = cmp(*val_tuple)
        if cmp_val:
          if search_params.sort_spec(i).sort_descending():
            return -cmp_val
          return cmp_val
      return 0
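
    # Each sort key element is a [text_value, numeric_value] pair, so cmp
    # compares the text values first and the numeric values second for each
    # sort expression, with the sign flipped for descending sort specs.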
    return sorted(docs, key=SortKey, cmp=SortCmp)

  def _AttachExpressions(self, docs, search_params):
    if search_params.has_field_spec():
      for doc in docs:
        evaluator = expression_evaluator.ExpressionEvaluator(
            doc, self._inverted_index)
        for expr in search_params.field_spec().expression_list():
          evaluator.Evaluate(expr)
    return docs
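
  # Search pipeline: unquote and strip the query, evaluate its parse tree
  # against the inverted index (TF*IDF scoring only when the request names a
  # scorer), sort by score or by the request's sort specs, and finally attach
  # any requested field expressions to each result.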

  def Search(self, search_request):
    """Searches the simple index for documents matching the request query."""
    query = urllib.unquote(search_request.query())
    query = query.strip()
    score = _ScoreRequested(search_request)
    if not query:
      docs = [_ScoredDocument(doc, 0.0) for doc in self._documents.values()]
    else:
      if not isinstance(query, unicode):
        query = unicode(query, 'utf-8')
      query_tree = query_parser.ParseAndSimplify(query)
      docs = self._Evaluate(query_tree, score=score)
    docs = self._Sort(docs, search_request, query, score)
    docs = self._AttachExpressions(docs, search_request)
    return docs

  def GetSchema(self):
    """Returns the schema for the index."""
    return self._inverted_index.GetSchema()

  def __repr__(self):
    return search_util.Repr(self, [('_index_spec', self._index_spec),
                                   ('_documents', self._documents),
                                   ('_inverted_index', self._inverted_index)])

class SearchServiceStub(apiproxy_stub.APIProxyStub):
  """Simple RAM backed Search service stub.

  This stub provides the search_service_pb.SearchService. But this is
  NOT a subclass of SearchService itself. Services are provided by
  the methods prefixed by "_Dynamic_".
  """

  _VERSION = 1

  _MAX_STORAGE_LIMIT = 1024 * 1024 * 1024

  def __init__(self, service_name='search', index_file=None):
    """Constructor.

    Args:
      service_name: Service name expected for all calls.
      index_file: The file to which search indexes will be persisted.
    """
    self.__indexes = {}
    self.__index_file = index_file
    self.__index_file_lock = threading.Lock()
    super(SearchServiceStub, self).__init__(service_name)

    self.Read()

  def _InvalidRequest(self, status, exception):
    status.set_code(search_service_pb.SearchServiceError.INVALID_REQUEST)
    status.set_error_detail(exception.message)

  def _UnknownIndex(self, status, index_spec):
    status.set_code(search_service_pb.SearchServiceError.OK)
    status.set_error_detail(
        "Index '%s' in namespace '%s' does not exist" %
        (index_spec.name(), index_spec.namespace()))

  def _GetNamespace(self, namespace):
    """Gets the namespace name.

    Args:
      namespace: Namespace provided in request arguments.

    Returns:
      If namespace is None, returns the name of the current global namespace.
      If namespace is not None, returns namespace.
    """
    if namespace is not None:
      return namespace
    return namespace_manager.get_namespace()

  def _GetIndex(self, index_spec, create=False):
    namespace = self._GetNamespace(index_spec.namespace())

    index = self.__indexes.setdefault(namespace, {}).get(index_spec.name())
    if index is None and create:
      index = SimpleIndex(index_spec)
      self.__indexes[namespace][index_spec.name()] = index
    return index

  def _Dynamic_IndexDocument(self, request, response):
    """A local implementation of SearchService.IndexDocument RPC.

    Index a new document or update an existing document.

    Args:
      request: A search_service_pb.IndexDocumentRequest.
      response: A search_service_pb.IndexDocumentResponse.
    """
    params = request.params()
    index = self._GetIndex(params.index_spec(), create=True)
    index.IndexDocuments(params.document_list(), response)

  def _Dynamic_DeleteDocument(self, request, response):
    """A local implementation of SearchService.DeleteDocument RPC.

    Args:
      request: A search_service_pb.DeleteDocumentRequest.
      response: A search_service_pb.DeleteDocumentResponse.
    """
    params = request.params()
    index_spec = params.index_spec()
    index = self._GetIndex(index_spec)
    for document_id in params.doc_id_list():
      delete_status = response.add_status()
      if index is None:
        delete_status.set_code(search_service_pb.SearchServiceError.OK)
        delete_status.set_error_detail('Not found')
      else:
        index.DeleteDocument(document_id, delete_status)

  def _Dynamic_ListIndexes(self, request, response):
    """A local implementation of SearchService.ListIndexes RPC.

    Args:
      request: A search_service_pb.ListIndexesRequest.
      response: A search_service_pb.ListIndexesResponse.

    Raises:
      ResponseTooLargeError: raised for testing admin console.
    """
    if request.has_app_id():
      if random.choice([True] + [False] * 9):
        raise apiproxy_errors.ResponseTooLargeError()

      for _ in xrange(random.randint(0, 2) * random.randint(5, 15)):
        new_index_spec = response.add_index_metadata().mutable_index_spec()
        new_index_spec.set_name(
            random.choice(list(_VISIBLE_PRINTABLE_ASCII - set('!'))) +
            ''.join(random.choice(list(_VISIBLE_PRINTABLE_ASCII))
                    for _ in xrange(random.randint(
                        0, search.MAXIMUM_INDEX_NAME_LENGTH))))
      response.mutable_status().set_code(
          random.choice([search_service_pb.SearchServiceError.OK] * 10 +
                        [search_service_pb.SearchServiceError.TRANSIENT_ERROR] +
                        [search_service_pb.SearchServiceError.INTERNAL_ERROR]))
      return

    response.mutable_status().set_code(
        search_service_pb.SearchServiceError.OK)

    namespace = self._GetNamespace(request.params().namespace())
    if namespace not in self.__indexes or not self.__indexes[namespace]:
      return

    keys, indexes = zip(*sorted(
        self.__indexes[namespace].iteritems(), key=lambda v: v[0]))
    position = 0
    params = request.params()
    if params.has_start_index_name():
      position = bisect.bisect_left(keys, params.start_index_name())
      if (not params.include_start_index() and position < len(keys)
          and keys[position] == params.start_index_name()):
        position += 1
    elif params.has_index_name_prefix():
      position = bisect.bisect_left(keys, params.index_name_prefix())
    if params.has_offset():
      position += params.offset()
    end_position = position + params.limit()
    prefix = params.index_name_prefix()
    for index in indexes[min(position, len(keys)):
                         min(end_position, len(keys))]:
      index_spec = index.index_spec
      if prefix and not index_spec.name().startswith(prefix):
        break
      metadata = response.add_index_metadata()
      new_index_spec = metadata.mutable_index_spec()
      new_index_spec.set_name(index_spec.name())
      new_index_spec.set_namespace(index_spec.namespace())
      if params.fetch_schema():
        self._AddSchemaInformation(index, metadata)
      self._AddStorageInformation(index, metadata)

  def _AddSchemaInformation(self, index, metadata_pb):
    schema = index.GetSchema()
    for name in schema:
      field_types = schema[name]
      new_field_types = metadata_pb.add_field()
      new_field_types.MergeFrom(field_types)

  def _AddStorageInformation(self, index, metadata_pb):
    total_usage = 0
    for document in index.Documents():
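      # Approximate each document's footprint as the serialized size of its
      # fields plus the length of its id.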
      for field in document.field_list():
        total_usage += field.ByteSize()
      total_usage += len(document.id())
    storage = metadata_pb.mutable_storage()
    storage.set_amount_used(total_usage)
    storage.set_limit(self._MAX_STORAGE_LIMIT)

  def _AddDocument(self, response, document, ids_only):
    doc = response.add_document()
    if ids_only:
      doc.set_id(document.id())
    else:
      doc.MergeFrom(document)

  def _Dynamic_ListDocuments(self, request, response):
    """A local implementation of SearchService.ListDocuments RPC.

    Args:
      request: A search_service_pb.ListDocumentsRequest.
      response: A search_service_pb.ListDocumentsResponse.
    """
    params = request.params()
    index = self._GetIndex(params.index_spec())
    if index is None:
      response.mutable_status().set_code(
          search_service_pb.SearchServiceError.OK)
      return

    num_docs = 0
    start = not params.has_start_doc_id()
    for document in sorted(index.Documents(), key=lambda doc: doc.id()):
      if start:
        if num_docs < params.limit():
          self._AddDocument(response, document, params.keys_only())
          num_docs += 1
      else:
        if document.id() >= params.start_doc_id():
          start = True
          if (document.id() != params.start_doc_id() or
              params.include_start_doc()):
            self._AddDocument(response, document, params.keys_only())
            num_docs += 1

    response.mutable_status().set_code(
        search_service_pb.SearchServiceError.OK)

  def _RandomSearchResponse(self, request, response):

    random.seed()
    if random.random() < 0.03:
      raise apiproxy_errors.ResponseTooLargeError()
    response.mutable_status().set_code(
        random.choice([search_service_pb.SearchServiceError.OK] * 30 +
                      [search_service_pb.SearchServiceError.TRANSIENT_ERROR] +
                      [search_service_pb.SearchServiceError.INTERNAL_ERROR]))

    params = request.params()
    random.seed(params.query())
    total = random.randint(0, 100)

    if random.random() < 0.3:
      total = 0

    offset = 0
    if params.has_offset():
      offset = params.offset()

    remaining = max(0, total - offset)
    nresults = min(remaining, params.limit())
    matched_count = offset + nresults
    if remaining > nresults:
      matched_count += random.randint(1, 100)

    def RandomText(charset, min_len, max_len):
      return ''.join(random.choice(charset)
                     for _ in xrange(random.randint(min_len, max_len)))

    for i in xrange(nresults):
      seed = '%s:%s' % (params.query(), i + offset)
      random.seed(seed)
      result = response.add_result()
      doc = result.mutable_document()
      doc_id = RandomText(string.letters + string.digits, 8, 10)
      doc.set_id(doc_id)
      random.seed(doc_id)
      for _ in params.sort_spec_list():
        result.add_score(random.random())

      for name, probability in [('creator', 0.90), ('last_change', 0.40)]:
        if random.random() < probability:
          field = doc.add_field()
          field.set_name(name)
          value = field.mutable_value()
          value.set_type(document_pb.FieldValue.TEXT)
          value.set_string_value(
              RandomText(string.letters + string.digits, 2, 10)
              + '@google.com')

      field = doc.add_field()
      field.set_name('content')
      value = field.mutable_value()
      value.set_type(document_pb.FieldValue.TEXT)
      value.set_string_value(
          RandomText(string.printable, 0, 15) + params.query() +
          RandomText(string.printable + 10 * string.whitespace, 5, 5000))

      for i in xrange(random.randint(0, 2)):
        field = doc.add_field()
        field.set_name(RandomText(string.letters, 3, 7))
        value = field.mutable_value()
        value.set_type(document_pb.FieldValue.TEXT)
        value.set_string_value(RandomText(string.printable, 0, 100))

    response.set_matched_count(matched_count)

  def _DefaultFillSearchResponse(self, params, results, response):
    """Fills the SearchResponse with the first set of results."""
    position_range = range(0, min(params.limit(), len(results)))
    self._FillSearchResponse(results, position_range, params.cursor_type(),
                             _ScoreRequested(params), response)

  def _CopyDocument(self, doc, doc_copy, field_names, ids_only=None):
    """Copies Document, doc, to doc_copy restricting fields to field_names."""
    doc_copy.set_id(doc.id())
    if ids_only:
      return
    if doc.has_language():
      doc_copy.set_language(doc.language())
    for field in doc.field_list():
      if not field_names or field.name() in field_names:
        doc_copy.add_field().CopyFrom(field)
    doc_copy.set_order_id(doc.order_id())

  def _FillSearchResponse(self, results, position_range, cursor_type, score,
                          response, field_names=None, ids_only=None):
    """Fills the SearchResponse with a selection of results."""
    for i in position_range:
      result = results[i]
      search_result = response.add_result()
      self._CopyDocument(result.document, search_result.mutable_document(),
                         field_names, ids_only)
      if cursor_type == search_service_pb.SearchParams.PER_RESULT:
        search_result.set_cursor(self._EncodeCursor(result.document))
      if score:
        search_result.add_score(result.score)
      for field, expression in result.expressions.iteritems():
        expr = search_result.add_expression()
        expr.set_name(field)
        if (isinstance(expression, float) or
            isinstance(expression, long) or
            isinstance(expression, int)):
          expr.mutable_value().set_string_value(str(expression))
          expr.mutable_value().set_type(document_pb.FieldValue.NUMBER)
        else:
          expr.mutable_value().set_string_value(expression)
          expr.mutable_value().set_type(document_pb.FieldValue.HTML)

  def _Dynamic_Search(self, request, response):
    """A local implementation of SearchService.Search RPC.

    Args:
      request: A search_service_pb.SearchRequest.
      response: A search_service_pb.SearchResponse.
    """
    if request.has_app_id():
      self._RandomSearchResponse(request, response)
      return

    index = self._GetIndex(request.params().index_spec())
    if index is None:
      self._UnknownIndex(response.mutable_status(),
                         request.params().index_spec())
      response.set_matched_count(0)
      return

    params = request.params()
    try:
      results = index.Search(params)
    except query_parser.QueryException, e:
      self._InvalidRequest(response.mutable_status(), e)
      response.set_matched_count(0)
      return
    except expression_evaluator.ExpressionEvaluationError, e:
      self._InvalidRequest(response.mutable_status(), e)
      response.set_matched_count(0)
      return
    except document_matcher.ExpressionTreeException, e:
      self._InvalidRequest(response.mutable_status(), e)
      response.set_matched_count(0)
      return
    response.set_matched_count(len(results))

    offset = 0
    if params.has_cursor():
      try:
        doc_id = self._DecodeCursor(params.cursor())
      except _InvalidCursorException, e:
        self._InvalidRequest(response.mutable_status(), e)
        response.set_matched_count(0)
        return
      for i, result in enumerate(results):
        if result.document.id() == doc_id:
          offset = i + 1
          break
    elif params.has_offset():
      offset = params.offset()

    if offset < len(results):
      limit = offset + params.limit()
      if limit >= len(results):
        range_end = len(results)
      else:
        range_end = limit
        if params.cursor_type() == search_service_pb.SearchParams.SINGLE:
          document = results[range_end - 1].document
          response.set_cursor(self._EncodeCursor(document))
      result_range = range(offset, range_end)
    else:
      result_range = range(0)
    field_names = params.field_spec().name_list()
    self._FillSearchResponse(results, result_range, params.cursor_type(),
                             _ScoreRequested(params), response, field_names,
                             params.keys_only())

    response.mutable_status().set_code(search_service_pb.SearchServiceError.OK)

  def _EncodeCursor(self, document):
    doc_id_hash = hashlib.sha224(document.id()).hexdigest()
    cursor = doc_id_hash + '|' + document.id()
    return base64.urlsafe_b64encode(cursor)

  def _DecodeCursor(self, encoded_cursor):
    cursor = base64.urlsafe_b64decode(encoded_cursor)
    separator = cursor.find('|')
    if separator < 0:
      raise _InvalidCursorException('Invalid cursor string: ' + encoded_cursor)
    doc_id_hash = cursor[:separator]
    doc_id = cursor[separator + 1:]
    if hashlib.sha224(doc_id).hexdigest() == doc_id_hash:
      return doc_id
    raise _InvalidCursorException('Invalid cursor string: ' + encoded_cursor)
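
  # Cursor format sketch: a cursor is base64.urlsafe_b64encode(
  # sha224_hex(doc_id) + '|' + doc_id), e.g. '<56 hex digits>|doc1' before
  # encoding. _DecodeCursor recomputes the hash so corrupted or forged
  # cursors are rejected with _InvalidCursorException.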

  def __repr__(self):
    return search_util.Repr(self, [('__indexes', self.__indexes)])

  def Write(self):
    """Writes search indexes to the index file.

    This method is a no-op if index_file is set to None.
    """
    if not self.__index_file:
      return
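
    # Pickle the (version, indexes) tuple to a temporary file in the same
    # directory, then rename it over the index file so readers never observe
    # a partial write; where rename cannot replace an existing file (e.g. on
    # Windows), the old file is removed before retrying the rename.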
    descriptor, tmp_filename = tempfile.mkstemp(
        dir=os.path.dirname(self.__index_file))
    tmpfile = os.fdopen(descriptor, 'wb')

    pickler = pickle.Pickler(tmpfile, protocol=1)
    pickler.fast = True
    pickler.dump((self._VERSION, self.__indexes))

    tmpfile.close()

    self.__index_file_lock.acquire()
    try:
      try:
        os.rename(tmp_filename, self.__index_file)
      except OSError:
        os.remove(self.__index_file)
        os.rename(tmp_filename, self.__index_file)
    finally:
      self.__index_file_lock.release()

  def _ReadFromFile(self):
    self.__index_file_lock.acquire()
    try:
      if os.path.isfile(self.__index_file):
        version, indexes = pickle.load(open(self.__index_file, 'rb'))
        if version == self._VERSION:
          return indexes
        logging.warning(
            'Saved search indexes are not compatible with this version of the '
            'SDK. Search indexes have been cleared.')
      else:
        logging.warning(
            'Could not read search indexes from %s', self.__index_file)
    except (AttributeError, LookupError, ImportError, NameError, TypeError,
            ValueError, pickle.PickleError, IOError), e:
      logging.warning(
          'Could not read indexes from %s. Try running with the '
          '--clear_search_index flag. Cause:\n%r' % (self.__index_file, e))
    finally:
      self.__index_file_lock.release()

    return {}

  def Read(self):
    """Reads search indexes from the index file.

    This method is a no-op if index_file is set to None.
    """
    if not self.__index_file:
      return
    read_indexes = self._ReadFromFile()
    if read_indexes:
      self.__indexes = read_indexes