App Engine Python SDK version 1.8.9
[gae.git] / python / google / appengine / api / search / simple_search_stub.py
blob5f5bf252590a4f98db42af40552ce4203f72ad30
1 #!/usr/bin/env python
3 # Copyright 2007 Google Inc.
5 # Licensed under the Apache License, Version 2.0 (the "License");
6 # you may not use this file except in compliance with the License.
7 # You may obtain a copy of the License at
9 # http://www.apache.org/licenses/LICENSE-2.0
11 # Unless required by applicable law or agreed to in writing, software
12 # distributed under the License is distributed on an "AS IS" BASIS,
13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 # See the License for the specific language governing permissions and
15 # limitations under the License.
21 """Simple RAM backed Search API stub."""
32 import base64
33 import bisect
34 import copy
35 import cPickle as pickle
36 import datetime
37 import functools
38 import logging
39 import math
40 import os
41 import random
42 import string
43 import tempfile
44 import threading
45 import urllib
46 import uuid
48 from google.appengine.datastore import document_pb
49 from google.appengine.api import apiproxy_stub
50 from google.appengine.api.namespace_manager import namespace_manager
51 from google.appengine.api.search import query_parser
52 from google.appengine.api.search import QueryParser
53 from google.appengine.api.search import search
54 from google.appengine.api.search import search_service_pb
55 from google.appengine.api.search import search_util
56 from google.appengine.api.search.stub import document_matcher
57 from google.appengine.api.search.stub import expression_evaluator
58 from google.appengine.api.search.stub import simple_tokenizer
59 from google.appengine.api.search.stub import tokens
60 from google.appengine.runtime import apiproxy_errors
# Public API of this module.  NOTE: the original extraction dropped the
# closing bracket of this list, which made the module unparseable; restored.
__all__ = ['IndexConsistencyError',
           'Posting',
           'PostingList',
           'RamInvertedIndex',
           'SearchServiceStub',
           'SimpleIndex',
           'FieldTypesDict',
          ]

# Printable ASCII characters that are actually visible (not whitespace);
# used to build random index names for the admin-console test response.
_VISIBLE_PRINTABLE_ASCII = frozenset(
    set(string.printable) - set(string.whitespace))
class IndexConsistencyError(Exception):
  """Raised when an index is accessed with a conflicting consistency mode.

  Deprecated as of 1.7.7.
  """
class Posting(object):
  """Records the sorted positions at which a token occurs in one document."""

  def __init__(self, doc_id):
    """Initializer.

    Args:
      doc_id: The identifier of the document with token occurrences.

    Raises:
      TypeError: If an unknown argument is passed.
    """
    self._doc_id = doc_id
    # Token positions, kept sorted ascending and free of duplicates.
    self._positions = []

  @property
  def doc_id(self):
    """Return id of the document that the token occurred in."""
    return self._doc_id

  def AddPosition(self, position):
    """Adds the position in token sequence to occurrences for token."""
    insert_at = bisect.bisect_left(self._positions, position)
    already_present = (insert_at < len(self._positions) and
                       self._positions[insert_at] == position)
    if not already_present:
      self._positions.insert(insert_at, position)

  def RemovePosition(self, position):
    """Removes the position in token sequence from occurrences for token."""
    index = bisect.bisect_left(self._positions, position)
    if index < len(self._positions) and self._positions[index] == position:
      self._positions.pop(index)

  def __cmp__(self, other):
    # Python 2 comparison hook: postings order by document id so that
    # PostingList can bisect on doc_id.
    if not isinstance(other, Posting):
      return -2
    return cmp(self.doc_id, other.doc_id)

  @property
  def positions(self):
    return self._positions

  def __repr__(self):
    return search_util.Repr(
        self, [('doc_id', self.doc_id), ('positions', self.positions)])
class PostingList(object):
  """Represents ordered positions of some token in document.

  A PostingList consists of a document id and a sequence of positions
  that the same token occurs in the document.
  """

  def __init__(self):
    # Postings kept sorted by doc_id (Posting.__cmp__ orders on doc_id),
    # which lets Add/Remove locate an entry with bisect.
    self._postings = []

  def Add(self, doc_id, position):
    """Adds the token position for the given doc_id."""
    posting = Posting(doc_id=doc_id)
    pos = bisect.bisect_left(self._postings, posting)
    if pos < len(self._postings) and self._postings[
        pos].doc_id == posting.doc_id:
      # A posting for this doc_id already exists; extend it in place.
      posting = self._postings[pos]
    else:
      self._postings.insert(pos, posting)
    posting.AddPosition(position)

  def Remove(self, doc_id, position):
    """Removes the token position for the given doc_id."""
    posting = Posting(doc_id=doc_id)
    pos = bisect.bisect_left(self._postings, posting)
    if pos < len(self._postings) and self._postings[
        pos].doc_id == posting.doc_id:
      posting = self._postings[pos]
      posting.RemovePosition(position)
      if not posting.positions:
        # Drop the posting entirely once no positions remain for the doc.
        del self._postings[pos]

  @property
  def postings(self):
    return self._postings

  def __iter__(self):
    return iter(self._postings)

  def __repr__(self):
    return search_util.Repr(self, [('postings', self.postings)])
169 class _ScoredDocument(object):
170 """A scored document_pb.Document."""
172 def __init__(self, document, score):
173 self._document = document
174 self._score = score
175 self._expressions = {}
177 @property
178 def document(self):
179 return self._document
181 @property
182 def score(self):
183 return self._score
185 @property
186 def expressions(self):
187 return self._expressions
189 def __repr__(self):
190 return search_util.Repr(
191 self, [('document', self.document), ('score', self.score)])
194 class _DocumentStatistics(object):
195 """Statistics about terms occuring in a document."""
197 def __init__(self):
198 self._term_stats = {}
200 def __iter__(self):
201 for item in self._term_stats.items():
202 yield item
204 def IncrementTermCount(self, term):
205 """Adds an occurrence of the term to the stats for the document."""
206 count = 0
207 if term in self._term_stats:
208 count = self._term_stats[term]
209 count += 1
210 self._term_stats[term] = count
212 def TermFrequency(self, term):
213 """Returns the term frequency in the document."""
214 if term not in self._term_stats:
215 return 0
216 return self._term_stats[term]
218 @property
219 def term_stats(self):
220 """Returns the collection of term frequencies in the document."""
221 return self._term_stats
223 def __eq__(self, other):
224 return self.term_stats == other.term_stats
226 def __hash__(self):
227 return hash(self.term_stats)
229 def __repr__(self):
230 return search_util.Repr(self, [('term_stats', self.term_stats)])
class FieldTypesDict(object):
  """Dictionary-like object mapping field names to document_pb.FieldTypes."""

  def __init__(self):
    # One document_pb.FieldTypes entry per distinct field name.
    self._field_types = []

  def __contains__(self, name):
    return any(f.name() == name for f in self._field_types)

  def __getitem__(self, name):
    for field_types in self._field_types:
      if field_types.name() == name:
        return field_types
    raise KeyError(name)

  def AddFieldType(self, name, field_type):
    """Records that the named field was seen with the given value type."""
    entry = None
    for f in self._field_types:
      if f.name() == name:
        entry = f
    if entry is None:
      entry = document_pb.FieldTypes()
      entry.set_name(name)
      self._field_types.append(entry)
    if field_type not in entry.type_list():
      entry.add_type(field_type)

  def __iter__(self):
    return iter(sorted(f.name() for f in self._field_types))

  def __repr__(self):
    return repr(self._field_types)
class RamInvertedIndex(object):
  """A simple RAM-resident inverted file over documents."""

  def __init__(self, tokenizer):
    """Initializer.

    Args:
      tokenizer: Tokenizer used to split field values into tokens.
    """
    self._tokenizer = tokenizer
    # Maps a token to the PostingList of documents containing it.
    self._inverted_index = {}
    # Field name -> set of field types seen, built up as documents arrive.
    self._schema = FieldTypesDict()
    self._document_ids = set([])

  def _AddDocumentId(self, doc_id):
    """Adds the doc_id to set in index."""
    self._document_ids.add(doc_id)

  def _RemoveDocumentId(self, doc_id):
    """Removes the doc_id from the set in index."""
    if doc_id in self._document_ids:
      self._document_ids.remove(doc_id)

  @property
  def document_count(self):
    # Number of distinct documents currently indexed.
    return len(self._document_ids)

  def _AddFieldType(self, name, field_type):
    """Adds the type to the list supported for a named field."""
    self._schema.AddFieldType(name, field_type)

  def GetDocumentStats(self, document):
    """Gets statistics about occurrences of terms in document."""
    document_stats = _DocumentStatistics()
    for field in document.field_list():
      for token in self._tokenizer.TokenizeValue(field_value=field.value()):
        document_stats.IncrementTermCount(token.chars)
    return document_stats

  def AddDocument(self, doc_id, document):
    """Adds a document into the index."""
    token_position = 0
    for field in document.field_list():
      self._AddFieldType(field.name(), field.value().type())
      # NOTE(review): token_position is never advanced between fields, so
      # every field's tokens start at position 0 -- confirm this is intended.
      self._AddTokens(doc_id, field.name(), field.value(), token_position)
    self._AddDocumentId(doc_id)

  def RemoveDocument(self, document):
    """Removes a document from the index."""
    doc_id = document.id()
    for field in document.field_list():
      self._RemoveTokens(doc_id, field.name(), field.value())
    self._RemoveDocumentId(doc_id)

  def _AddTokens(self, doc_id, field_name, field_value, token_position):
    """Adds token occurrences for a given doc's field value."""
    for token in self._tokenizer.TokenizeValue(field_value, token_position):
      # Index each token both unrestricted and restricted to its field, so
      # queries with and without a field prefix can be answered.
      self._AddToken(doc_id, token)
      self._AddToken(doc_id, token.RestrictField(field_name))

  def _RemoveTokens(self, doc_id, field_name, field_value):
    """Removes tokens occurrences for a given doc's field value."""
    for token in self._tokenizer.TokenizeValue(field_value=field_value):
      self._RemoveToken(doc_id, token)
      self._RemoveToken(doc_id, token.RestrictField(field_name))

  def _AddToken(self, doc_id, token):
    """Adds a token occurrence for a document."""
    postings = self._inverted_index.get(token)
    if postings is None:
      self._inverted_index[token] = postings = PostingList()
    postings.Add(doc_id, token.position)

  def _RemoveToken(self, doc_id, token):
    """Removes a token occurrence for a document."""
    if token in self._inverted_index:
      postings = self._inverted_index[token]
      postings.Remove(doc_id, token.position)
      if not postings.postings:
        # Keep the index free of empty posting lists.
        del self._inverted_index[token]

  def GetPostingsForToken(self, token):
    """Returns all document postings for the token."""
    if token in self._inverted_index:
      return self._inverted_index[token].postings
    return []

  def GetSchema(self):
    """Returns the schema for the index."""
    return self._schema

  def __repr__(self):
    return search_util.Repr(self, [('_inverted_index', self._inverted_index),
                                   ('_schema', self._schema),
                                   ('document_count', self.document_count)])
358 def _ScoreRequested(params):
359 """Returns True if match scoring requested, False otherwise."""
360 return params.has_scorer_spec() and params.scorer_spec().has_scorer()
class SimpleIndex(object):
  """A simple search service which uses a RAM-resident inverted file."""

  def __init__(self, index_spec):
    """Initializer.

    Args:
      index_spec: The specification (name/namespace) of this index.
    """
    self._index_spec = index_spec
    # Maps doc_id -> document_pb.Document for every indexed document.
    self._documents = {}
    self._parser = simple_tokenizer.SimpleTokenizer(split_restricts=False)
    self._inverted_index = RamInvertedIndex(simple_tokenizer.SimpleTokenizer())

  @property
  def index_spec(self):
    """Returns the index specification for the index."""
    return self._index_spec

  def IndexDocuments(self, documents, response):
    """Indexes an iterable DocumentPb.Document."""
    for document in documents:
      doc_id = document.id()
      if not doc_id:
        # Assign a fresh id when the client did not provide one.
        doc_id = str(uuid.uuid4())
        document.set_id(doc_id)
      # Validate the document by round-tripping it through the public API;
      # invalid documents get an INVALID_REQUEST status and are skipped.
      try:
        search._NewDocumentFromPb(document)
      except ValueError, e:
        new_status = response.add_status()
        new_status.set_code(search_service_pb.SearchServiceError.INVALID_REQUEST)
        new_status.set_error_detail(e.message)
        continue
      response.add_doc_id(doc_id)
      if doc_id in self._documents:
        # Re-indexing an existing id replaces the old document entirely.
        old_document = self._documents[doc_id]
        self._inverted_index.RemoveDocument(old_document)
      self._documents[doc_id] = document
      new_status = response.add_status()
      new_status.set_code(search_service_pb.SearchServiceError.OK)
      self._inverted_index.AddDocument(doc_id, document)

  def DeleteDocuments(self, document_ids, response):
    """Deletes documents for the given document_ids."""
    for document_id in document_ids:
      if document_id in self._documents:
        document = self._documents[document_id]
        self._inverted_index.RemoveDocument(document)
        del self._documents[document_id]
      # An OK status is added even for unknown ids (delete is idempotent).
      delete_status = response.add_status()
      delete_status.set_code(search_service_pb.SearchServiceError.OK)

  def Documents(self):
    """Returns the documents in the index."""
    return self._documents.values()

  def _TermFrequency(self, term, document):
    """Return the term frequency in the document."""
    return self._inverted_index.GetDocumentStats(document).TermFrequency(term)

  @property
  def document_count(self):
    """Returns the count of documents in the index."""
    return self._inverted_index.document_count

  def _DocumentCountForTerm(self, term):
    """Returns the document count for documents containing the term."""
    return len(self._PostingsForToken(tokens.Token(chars=term)))

  def _InverseDocumentFrequency(self, term):
    """Returns inverse document frequency of term."""
    doc_count = self._DocumentCountForTerm(term)
    if doc_count:
      return math.log10(self.document_count / float(doc_count))
    else:
      return 0

  def _TermFrequencyInverseDocumentFrequency(self, term, document):
    """Returns the term frequency times inverse document frequency of term."""
    return (self._TermFrequency(term, document) *
            self._InverseDocumentFrequency(term))

  def _ScoreDocument(self, document, score, terms):
    """Scores a document for the given query.

    Returns 0 when scoring was not requested; otherwise the sum of
    TF-IDF contributions over all query terms.
    """
    if not score:
      return 0
    tf_idf = 0
    for term in terms:
      tf_idf += self._TermFrequencyInverseDocumentFrequency(term, document)
    return tf_idf

  def _PostingsForToken(self, token):
    """Returns the postings for the token."""
    return self._inverted_index.GetPostingsForToken(token)

  def _CollectTerms(self, node):
    """Get all search terms for scoring."""
    if node.getType() in search_util.TEXT_QUERY_TYPES:
      return set([query_parser.GetQueryNodeText(node).strip('"')])
    elif node.children:
      if node.getType() == QueryParser.EQ and len(node.children) > 1:
        # For field:value restrictions skip the field-name child; only the
        # value children contribute scoring terms.
        children = node.children[1:]
      else:
        children = node.children

      result = set()
      for term_set in (self._CollectTerms(child) for child in children):
        result.update(term_set)
      return result
    return set()

  def _CollectFields(self, node):
    """Returns the set of field names referenced by the query tree."""
    if node.getType() == QueryParser.EQ and node.children:
      return set([query_parser.GetQueryNodeText(node.children[0])])
    elif node.children:
      result = set()
      for term_set in (self._CollectFields(child) for child in node.children):
        result.update(term_set)
      return result
    return set()

  def _Evaluate(self, node, score=True):
    """Retrieve scored results for a search query."""
    doc_match = document_matcher.DocumentMatcher(node, self._inverted_index)

    matched_documents = doc_match.FilterDocuments(self._documents.itervalues())
    terms = self._CollectTerms(node)
    scored_documents = [
        _ScoredDocument(doc, self._ScoreDocument(doc, score, terms))
        for doc in matched_documents]
    return scored_documents

  def _Sort(self, docs, search_params, score):
    """Orders results by score, request sort specs, or recency (order_id)."""
    if score:
      return sorted(docs, key=lambda doc: doc.score, reverse=True)

    if not search_params.sort_spec_size():
      # Default ordering: most recently indexed documents first.
      return sorted(docs, key=lambda doc: doc.document.order_id(), reverse=True)

    def SortKey(scored_doc):
      """Return the sort key for a document based on the request parameters.

      Arguments:
        scored_doc: The document to score

      Returns:
        The sort key of a document. The sort key is a tuple, where the nth
        element in the tuple corresponds to the value of the nth sort expression
        evaluated on the document.

      Raises:
        Exception: if no default value is specified.
      """
      expr_vals = []
      for sort_spec in search_params.sort_spec_list():
        if not (sort_spec.has_default_value_text() or
                sort_spec.has_default_value_numeric()):
          raise Exception('A default value must be specified for sorting.')
        elif sort_spec.has_default_value_text():
          default_value = sort_spec.default_value_text()
        else:
          default_value = sort_spec.default_value_numeric()
        val = expression_evaluator.ExpressionEvaluator(
            scored_doc, self._inverted_index, True).ValueOf(
                sort_spec.sort_expression(), default_value=default_value)
        if isinstance(val, datetime.datetime):
          # Dates sort by their epoch time, matching production behavior.
          val = search_util.EpochTime(val)
        expr_vals.append(val)
      return tuple(expr_vals)

    def SortCmp(x, y):
      """The comparison function for sort keys."""

      # Compare element-wise; the per-spec sort_descending flag flips the
      # comparison for that position only (Python 2 cmp-style sort).
      for i, val_tuple in enumerate(zip(x, y)):
        cmp_val = cmp(*val_tuple)
        if cmp_val:
          if search_params.sort_spec(i).sort_descending():
            return -cmp_val
          return cmp_val
      return 0
    return sorted(docs, key=SortKey, cmp=SortCmp)

  def _AttachExpressions(self, docs, search_params):
    """Evaluates requested field expressions onto each scored document."""
    if search_params.has_field_spec():
      for doc in docs:
        evaluator = expression_evaluator.ExpressionEvaluator(
            doc, self._inverted_index)
        for expr in search_params.field_spec().expression_list():
          evaluator.Evaluate(expr)
    return docs

  def Search(self, search_request):
    """Searches the simple index for documents matching the request."""
    query = urllib.unquote(search_request.query())
    query = query.strip()
    score = _ScoreRequested(search_request)
    if not query:
      # An empty query matches every document with a zero score.
      docs = [_ScoredDocument(doc, 0.0) for doc in self._documents.values()]
    else:
      if not isinstance(query, unicode):
        query = unicode(query, 'utf-8')
      query_tree = query_parser.ParseAndSimplify(query)
      docs = self._Evaluate(query_tree, score=score)
    docs = self._Sort(docs, search_request, score)
    docs = self._AttachExpressions(docs, search_request)
    return docs

  def GetSchema(self):
    """Returns the schema for the index."""
    return self._inverted_index.GetSchema()

  def __repr__(self):
    return search_util.Repr(self, [('_index_spec', self._index_spec),
                                   ('_documents', self._documents),
                                   ('_inverted_index', self._inverted_index)])
class SearchServiceStub(apiproxy_stub.APIProxyStub):
  """Simple RAM backed Search service stub.

  This stub provides the search_service_pb.SearchService. But this is
  NOT a subclass of SearchService itself. Services are provided by
  the methods prefixed by "_Dynamic_".
  """

  # Version tag written into the pickled index file; persisted indexes from a
  # different version are discarded on Read().
  _VERSION = 1

  def __init__(self, service_name='search', index_file=None):
    """Constructor.

    Args:
      service_name: Service name expected for all calls.
      index_file: The file to which search indexes will be persisted.
    """
    # Maps namespace -> {index name -> SimpleIndex}.
    self.__indexes = {}
    self.__index_file = index_file
    self.__index_file_lock = threading.Lock()
    super(SearchServiceStub, self).__init__(service_name)

    self.Read()

  def _InvalidRequest(self, status, exception):
    # Fills status with INVALID_REQUEST and the exception's message.
    status.set_code(search_service_pb.SearchServiceError.INVALID_REQUEST)
    status.set_error_detail(exception.message)

  def _UnknownIndex(self, status, index_spec):
    # An unknown index is reported with an OK code plus a detail string, so
    # callers treat a missing index as an empty (not failing) result.
    status.set_code(search_service_pb.SearchServiceError.OK)
    status.set_error_detail('no index for %r' % index_spec)

  def _GetNamespace(self, namespace):
    """Get namespace name.

    Args:
      namespace: Namespace provided in request arguments.

    Returns:
      If namespace is None, returns the name of the current global namespace. If
      namespace is not None, returns namespace.
    """
    if namespace is not None:
      return namespace
    return namespace_manager.get_namespace()

  def _GetIndex(self, index_spec, create=False):
    """Returns the index for index_spec, creating it when create is True."""
    namespace = self._GetNamespace(index_spec.namespace())

    index = self.__indexes.setdefault(namespace, {}).get(index_spec.name())
    if index is None:
      if create:
        index = SimpleIndex(index_spec)
        self.__indexes[namespace][index_spec.name()] = index
      else:
        return None
    return index

  def _Dynamic_IndexDocument(self, request, response):
    """A local implementation of SearchService.IndexDocument RPC.

    Index a new document or update an existing document.

    Args:
      request: A search_service_pb.IndexDocumentRequest.
      response: An search_service_pb.IndexDocumentResponse.
    """
    params = request.params()
    index = self._GetIndex(params.index_spec(), create=True)
    index.IndexDocuments(params.document_list(), response)

  def _Dynamic_DeleteDocument(self, request, response):
    """A local implementation of SearchService.DeleteDocument RPC.

    Args:
      request: A search_service_pb.DeleteDocumentRequest.
      response: An search_service_pb.DeleteDocumentResponse.
    """
    params = request.params()
    index_spec = params.index_spec()
    index = self._GetIndex(index_spec)
    if index is None:
      self._UnknownIndex(response.add_status(), index_spec)
      return
    index.DeleteDocuments(params.doc_id_list(), response)

  def _Dynamic_ListIndexes(self, request, response):
    """A local implementation of SearchService.ListIndexes RPC.

    Args:
      request: A search_service_pb.ListIndexesRequest.
      response: An search_service_pb.ListIndexesResponse.

    Raises:
      ResponseTooLargeError: raised for testing admin console.
    """

    # Requests carrying an app_id come from the admin console test harness:
    # respond with random (sometimes failing) data instead of real indexes.
    if request.has_app_id():
      if random.choice([True] + [False] * 9):
        raise apiproxy_errors.ResponseTooLargeError()

      for _ in xrange(random.randint(0, 2) * random.randint(5, 15)):
        new_index_spec = response.add_index_metadata().mutable_index_spec()
        new_index_spec.set_name(
            random.choice(list(_VISIBLE_PRINTABLE_ASCII - set('!'))) +
            ''.join(random.choice(list(_VISIBLE_PRINTABLE_ASCII))
                    for _ in xrange(random.randint(
                        0, search.MAXIMUM_INDEX_NAME_LENGTH))))
      response.mutable_status().set_code(
          random.choice([search_service_pb.SearchServiceError.OK] * 10 +
                        [search_service_pb.SearchServiceError.TRANSIENT_ERROR] +
                        [search_service_pb.SearchServiceError.INTERNAL_ERROR]))
      return

    response.mutable_status().set_code(
        search_service_pb.SearchServiceError.OK)

    namespace = self._GetNamespace(request.params().namespace())
    if namespace not in self.__indexes or not self.__indexes[namespace]:
      return

    # Walk the indexes in name order and page through them according to the
    # request's start name / prefix / offset / limit parameters.
    keys, indexes = zip(*sorted(
        self.__indexes[namespace].iteritems(), key=lambda v: v[0]))
    position = 0
    params = request.params()
    if params.has_start_index_name():
      position = bisect.bisect_left(keys, params.start_index_name())
      if (not params.include_start_index() and position < len(keys)
          and keys[position] == params.start_index_name()):
        position += 1
    elif params.has_index_name_prefix():
      position = bisect.bisect_left(keys, params.index_name_prefix())
    if params.has_offset():
      position += params.offset()
    end_position = position + params.limit()
    prefix = params.index_name_prefix()
    for index in indexes[min(position, len(keys)):min(end_position, len(keys))]:
      index_spec = index.index_spec
      if prefix and not index_spec.name().startswith(prefix):
        # Names are sorted, so the first non-matching name ends the prefix run.
        break
      metadata = response.add_index_metadata()
      new_index_spec = metadata.mutable_index_spec()
      new_index_spec.set_name(index_spec.name())
      new_index_spec.set_namespace(index_spec.namespace())
      if params.fetch_schema():
        self._AddSchemaInformation(index, metadata)

  def _AddSchemaInformation(self, index, metadata_pb):
    """Copies the index's field-type schema into the metadata proto."""
    schema = index.GetSchema()
    for name in schema:
      field_types = schema[name]
      new_field_types = metadata_pb.add_field()
      new_field_types.MergeFrom(field_types)

  def _AddDocument(self, response, document, ids_only):
    """Appends document (or just its id when ids_only) to the response."""
    doc = response.add_document()
    if ids_only:
      doc.set_id(document.id())
    else:
      doc.MergeFrom(document)

  def _Dynamic_ListDocuments(self, request, response):
    """A local implementation of SearchService.ListDocuments RPC.

    Args:
      request: A search_service_pb.ListDocumentsRequest.
      response: An search_service_pb.ListDocumentsResponse.
    """
    params = request.params()
    index = self._GetIndex(params.index_spec(), create=True)
    # NOTE(review): with create=True _GetIndex never returns None, so this
    # branch appears unreachable; kept as-is.
    if index is None:
      self._UnknownIndex(response.mutable_status(), params.index_spec())
      return

    num_docs = 0
    # start is False until the requested start_doc_id has been reached.
    start = not params.has_start_doc_id()
    for document in sorted(index.Documents(), key=lambda doc: doc.id()):
      if start:
        if num_docs < params.limit():
          self._AddDocument(response, document, params.keys_only())
          num_docs += 1
      else:
        if document.id() >= params.start_doc_id():
          start = True
          # The start document itself is included only when requested.
          if (document.id() != params.start_doc_id() or
              params.include_start_doc()):
            self._AddDocument(response, document, params.keys_only())
            num_docs += 1

    response.mutable_status().set_code(
        search_service_pb.SearchServiceError.OK)

  def _RandomSearchResponse(self, request, response):
    """Fills response with random results for admin console testing."""

    random.seed()
    if random.random() < 0.03:
      raise apiproxy_errors.ResponseTooLargeError()
    response.mutable_status().set_code(
        random.choice([search_service_pb.SearchServiceError.OK] * 30 +
                      [search_service_pb.SearchServiceError.TRANSIENT_ERROR] +
                      [search_service_pb.SearchServiceError.INTERNAL_ERROR]))

    # Re-seed on the query so repeated identical queries return the same
    # pseudo-random result set.
    params = request.params()
    random.seed(params.query())
    total = random.randint(0, 100)

    if random.random() < 0.3:
      total = 0

    offset = 0
    if params.has_offset():
      offset = params.offset()

    remaining = max(0, total - offset)
    nresults = min(remaining, params.limit())
    matched_count = offset + nresults
    if remaining > nresults:
      matched_count += random.randint(1, 100)

    def RandomText(charset, min_len, max_len):
      # Random string over charset with length in [min_len, max_len].
      return ''.join(random.choice(charset)
                     for _ in xrange(random.randint(min_len, max_len)))

    for i in xrange(nresults):
      # Seed per result so each (query, position) pair is deterministic.
      seed = '%s:%s' % (params.query(), i + offset)
      random.seed(seed)
      result = response.add_result()
      doc = result.mutable_document()
      doc_id = RandomText(string.letters + string.digits, 8, 10)
      doc.set_id(doc_id)
      random.seed(doc_id)
      for _ in params.sort_spec_list():
        result.add_score(random.random())

      for name, probability in [('creator', 0.90), ('last_change', 0.40)]:
        if random.random() < probability:
          field = doc.add_field()
          field.set_name(name)
          value = field.mutable_value()
          value.set_type(document_pb.FieldValue.TEXT)
          value.set_string_value(
              RandomText(string.letters + string.digits, 2, 10)
              + '@google.com')

      # A 'content' field that always embeds the query text.
      field = doc.add_field()
      field.set_name('content')
      value = field.mutable_value()
      value.set_type(document_pb.FieldValue.TEXT)
      value.set_string_value(
          RandomText(string.printable, 0, 15) + params.query() +
          RandomText(string.printable + 10 * string.whitespace, 5, 5000))

      # Zero to two extra random fields.
      for i in xrange(random.randint(0, 2)):
        field = doc.add_field()
        field.set_name(RandomText(string.letters, 3, 7))
        value = field.mutable_value()
        value.set_type(document_pb.FieldValue.TEXT)
        value.set_string_value(RandomText(string.printable, 0, 100))

    response.set_matched_count(matched_count)

  def _DefaultFillSearchResponse(self, params, results, response):
    """Fills the SearchResponse with the first set of results."""
    position_range = range(0, min(params.limit(), len(results)))
    self._FillSearchResponse(results, position_range, params.cursor_type(),
                             _ScoreRequested(params), response)

  def _CopyDocument(self, doc, doc_copy, field_names, ids_only=None):
    """Copies Document, doc, to doc_copy restricting fields to field_names."""
    doc_copy.set_id(doc.id())
    if ids_only:
      return
    if doc.has_language():
      doc_copy.set_language(doc.language())
    for field in doc.field_list():
      # An empty field_names means "copy all fields".
      if not field_names or field.name() in field_names:
        doc_copy.add_field().CopyFrom(field)
    doc_copy.set_order_id(doc.order_id())

  def _FillSearchResponse(self, results, position_range, cursor_type, score,
                          response, field_names=None, ids_only=None):
    """Fills the SearchResponse with a selection of results."""
    for i in position_range:
      result = results[i]
      search_result = response.add_result()
      self._CopyDocument(result.document, search_result.mutable_document(),
                         field_names, ids_only)
      if cursor_type == search_service_pb.SearchParams.PER_RESULT:
        search_result.set_cursor(self._EncodeCursor(result.document))
      if score:
        search_result.add_score(result.score)
      for field, expression in result.expressions.iteritems():
        expr = search_result.add_expression()
        expr.set_name(field)
        # Numeric expression values are serialized as strings but typed
        # NUMBER; everything else is returned as HTML.
        if (isinstance(expression, float) or
            isinstance(expression, long) or
            isinstance(expression, int)):
          expr.mutable_value().set_string_value(str(expression))
          expr.mutable_value().set_type(document_pb.FieldValue.NUMBER)
        else:
          expr.mutable_value().set_string_value(expression)
          expr.mutable_value().set_type(document_pb.FieldValue.HTML)

  def _Dynamic_Search(self, request, response):
    """A local implementation of SearchService.Search RPC.

    Args:
      request: A search_service_pb.SearchRequest.
      response: An search_service_pb.SearchResponse.
    """
    if request.has_app_id():
      # Admin-console test request: serve random data.
      self._RandomSearchResponse(request, response)
      return

    index = None
    index = self._GetIndex(request.params().index_spec())
    if index is None:
      self._UnknownIndex(response.mutable_status(),
                         request.params().index_spec())
      response.set_matched_count(0)
      return

    params = request.params()
    try:
      results = index.Search(params)
    except query_parser.QueryException, e:
      self._InvalidRequest(response.mutable_status(), e)
      response.set_matched_count(0)
      return
    except document_matcher.ExpressionTreeException, e:
      self._InvalidRequest(response.mutable_status(), e)
      response.set_matched_count(0)
      return
    response.set_matched_count(len(results))

    # A cursor takes precedence over an explicit offset: resume just after
    # the document the cursor names.
    offset = 0
    if params.has_cursor():
      doc_id = self._DecodeCursor(params.cursor())
      for i, result in enumerate(results):
        if result.document.id() == doc_id:
          offset = i + 1
          break
    elif params.has_offset():
      offset = params.offset()

    if offset < len(results):

      limit = offset + params.limit()
      if limit >= len(results):

        # Fewer results than the limit: no continuation cursor is needed.
        range_end = len(results)
      else:

        # More results remain, so emit a SINGLE cursor pointing at the last
        # returned document when one was requested.
        range_end = limit
        if params.cursor_type() == search_service_pb.SearchParams.SINGLE:
          document = results[range_end - 1].document
          response.set_cursor(self._EncodeCursor(document))
      result_range = range(offset, range_end)
    else:
      result_range = range(0)
    field_names = params.field_spec().name_list()
    self._FillSearchResponse(results, result_range, params.cursor_type(),
                             _ScoreRequested(params), response, field_names,
                             params.keys_only())

    response.mutable_status().set_code(search_service_pb.SearchServiceError.OK)

  def _EncodeCursor(self, document):
    # Cursors are simply the URL-safe base64 of the document id.
    return base64.urlsafe_b64encode(document.id())

  def _DecodeCursor(self, cursor):
    return base64.urlsafe_b64decode(cursor)

  def __repr__(self):
    return search_util.Repr(self, [('__indexes', self.__indexes)])

  def Write(self):
    """Write search indexes to the index file.

    This method is a no-op if index_file is set to None.
    """
    if not self.__index_file:
      return

    # Pickle into a temp file in the same directory, then rename over the
    # real file so readers never observe a partially written index.
    descriptor, tmp_filename = tempfile.mkstemp(
        dir=os.path.dirname(self.__index_file))
    tmpfile = os.fdopen(descriptor, 'wb')

    pickler = pickle.Pickler(tmpfile, protocol=1)
    pickler.fast = True
    pickler.dump((self._VERSION, self.__indexes))

    tmpfile.close()

    self.__index_file_lock.acquire()
    try:
      try:

        os.rename(tmp_filename, self.__index_file)
      except OSError:

        # On Windows rename fails if the target exists; remove it and retry.
        os.remove(self.__index_file)
        os.rename(tmp_filename, self.__index_file)
    finally:
      self.__index_file_lock.release()

  def _ReadFromFile(self):
    """Unpickles indexes from the index file; returns {} on any failure."""
    self.__index_file_lock.acquire()
    try:
      if os.path.isfile(self.__index_file):
        version, indexes = pickle.load(open(self.__index_file, 'rb'))
        if version == self._VERSION:
          return indexes
        logging.warning(
            'Saved search indexes are not compatible with this version of the '
            'SDK. Search indexes have been cleared.')
      else:
        logging.warning(
            'Could not read search indexes from %s', self.__index_file)
    except (AttributeError, LookupError, ImportError, NameError, TypeError,
            ValueError, pickle.PickleError, IOError), e:
      # Corrupt or unreadable index files are logged and ignored rather than
      # crashing the dev server.
      logging.warning(
          'Could not read indexes from %s. Try running with the '
          '--clear_search_index flag. Cause:\n%r' % (self.__index_file, e))
    finally:
      self.__index_file_lock.release()

    return {}

  def Read(self):
    """Read search indexes from the index file.

    This method is a no-op if index_file is set to None.
    """
    if not self.__index_file:
      return
    read_indexes = self._ReadFromFile()
    if read_indexes:
      self.__indexes = read_indexes