3 # Copyright 2007 Google Inc.
5 # Licensed under the Apache License, Version 2.0 (the "License");
6 # you may not use this file except in compliance with the License.
7 # You may obtain a copy of the License at
9 # http://www.apache.org/licenses/LICENSE-2.0
11 # Unless required by applicable law or agreed to in writing, software
12 # distributed under the License is distributed on an "AS IS" BASIS,
13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 # See the License for the specific language governing permissions and
15 # limitations under the License.
21 """Simple RAM backed Search API stub."""
import base64
import bisect
import cPickle as pickle
import datetime
import logging
import math
import os
import random
import string
import tempfile
import threading
import urllib
import uuid

from google.appengine.datastore import document_pb
from google.appengine.api import apiproxy_stub
from google.appengine.api.namespace_manager import namespace_manager
from google.appengine.api.search import query_parser
from google.appengine.api.search import QueryParser
from google.appengine.api.search import search
from google.appengine.api.search import search_service_pb
from google.appengine.api.search import search_util
from google.appengine.api.search.stub import document_matcher
from google.appengine.api.search.stub import expression_evaluator
from google.appengine.api.search.stub import simple_tokenizer
from google.appengine.api.search.stub import tokens
from google.appengine.runtime import apiproxy_errors
62 __all__
= ['IndexConsistencyError',
71 _VISIBLE_PRINTABLE_ASCII
= frozenset(
72 set(string
.printable
) - set(string
.whitespace
))
class IndexConsistencyError(Exception):
  """Deprecated 1.7.7. Accessed index with same name different consistency."""
class Posting(object):
  """Represents occurrences of some token at positions in a document."""

  def __init__(self, doc_id):
    """Initializer.

    Args:
      doc_id: The identifier of the document with token occurrences.

    Raises:
      TypeError: If an unknown argument is passed.
    """
    self._doc_id = doc_id
    # Kept sorted so bisect gives O(log n) membership checks.
    self._positions = []

  @property
  def doc_id(self):
    """Return id of the document that the token occurred in."""
    return self._doc_id

  @property
  def positions(self):
    """Return the sorted list of positions the token occurs at."""
    return self._positions

  def AddPosition(self, position):
    """Adds the position in token sequence to occurrences for token."""
    pos = bisect.bisect_left(self._positions, position)
    if pos < len(self._positions) and self._positions[pos] == position:
      # Position already recorded; keep the list duplicate-free.
      return
    self._positions.insert(pos, position)

  def RemovePosition(self, position):
    """Removes the position in token sequence from occurrences for token."""
    pos = bisect.bisect_left(self._positions, position)
    if pos < len(self._positions) and self._positions[pos] == position:
      del self._positions[pos]

  def __cmp__(self, other):
    # Orders postings by document id so PostingList can bisect on doc_id.
    if not isinstance(other, Posting):
      return -2
    return cmp(self.doc_id, other.doc_id)

  def __repr__(self):
    return search_util.Repr(
        self, [('doc_id', self.doc_id), ('positions', self.positions)])
class PostingList(object):
  """Represents ordered positions of some token in document.

  A PostingList consists of a document id and a sequence of positions
  that the same token occurs in the document.
  """

  def __init__(self):
    # Sorted (by doc_id, via Posting.__cmp__) list of Posting objects.
    self._postings = []

  def Add(self, doc_id, position):
    """Adds the token position for the given doc_id."""
    posting = Posting(doc_id=doc_id)
    pos = bisect.bisect_left(self._postings, posting)
    if pos < len(self._postings) and self._postings[
        pos].doc_id == posting.doc_id:
      # Reuse the existing posting for this document.
      posting = self._postings[pos]
    else:
      self._postings.insert(pos, posting)
    posting.AddPosition(position)

  def Remove(self, doc_id, position):
    """Removes the token position for the given doc_id."""
    posting = Posting(doc_id=doc_id)
    pos = bisect.bisect_left(self._postings, posting)
    if pos < len(self._postings) and self._postings[
        pos].doc_id == posting.doc_id:
      posting = self._postings[pos]
      posting.RemovePosition(position)
      # Drop postings that no longer carry any positions.
      if not posting.positions:
        del self._postings[pos]

  @property
  def postings(self):
    """Returns the sorted list of Postings held by this list."""
    return self._postings

  def __iter__(self):
    return iter(self._postings)

  def __repr__(self):
    return search_util.Repr(self, [('postings', self.postings)])
169 class _ScoredDocument(object):
170 """A scored document_pb.Document."""
172 def __init__(self
, document
, score
):
173 self
._document
= document
175 self
._expressions
= {}
179 return self
._document
186 def expressions(self
):
187 return self
._expressions
190 return search_util
.Repr(
191 self
, [('document', self
.document
), ('score', self
.score
)])
194 class _DocumentStatistics(object):
195 """Statistics about terms occuring in a document."""
198 self
._term
_stats
= {}
201 for item
in self
._term
_stats
.items():
204 def IncrementTermCount(self
, term
):
205 """Adds an occurrence of the term to the stats for the document."""
207 if term
in self
._term
_stats
:
208 count
= self
._term
_stats
[term
]
210 self
._term
_stats
[term
] = count
212 def TermFrequency(self
, term
):
213 """Returns the term frequency in the document."""
214 if term
not in self
._term
_stats
:
216 return self
._term
_stats
[term
]
219 def term_stats(self
):
220 """Returns the collection of term frequencies in the document."""
221 return self
._term
_stats
223 def __eq__(self
, other
):
224 return self
.term_stats
== other
.term_stats
227 return hash(self
.term_stats
)
230 return search_util
.Repr(self
, [('term_stats', self
.term_stats
)])
class FieldTypesDict(object):
  """Dictionary-like object for type mappings."""

  def __init__(self):
    # List of document_pb.FieldTypes entries, keyed by their .name().
    self._field_types = []

  def __contains__(self, name):
    return name in [f.name() for f in self._field_types]

  def __getitem__(self, name):
    """Returns the FieldTypes entry for name.

    Raises:
      KeyError: If no field with the given name has been added.
    """
    for f in self._field_types:
      if name == f.name():
        return f
    raise KeyError(name)

  def AddFieldType(self, name, field_type):
    """Records that the named field has been seen with the given type."""
    field_types = None
    for f in self._field_types:
      if name == f.name():
        field_types = f
    if field_types is None:
      field_types = document_pb.FieldTypes()
      field_types.set_name(name)
      self._field_types.append(field_types)
    if field_type not in field_types.type_list():
      field_types.add_type(field_type)

  def __iter__(self):
    # Iterates field names in sorted order, regardless of insertion order.
    return iter(sorted([f.name() for f in self._field_types]))

  def __repr__(self):
    return repr(self._field_types)
class RamInvertedIndex(object):
  """A simple RAM-resident inverted file over documents."""

  def __init__(self, tokenizer):
    self._tokenizer = tokenizer
    # Maps token -> PostingList of (doc_id, positions) containing it.
    self._inverted_index = {}
    # Accumulated field name -> field types mapping across all documents.
    self._schema = FieldTypesDict()
    self._document_ids = set([])

  def _AddDocumentId(self, doc_id):
    """Adds the doc_id to set in index."""
    self._document_ids.add(doc_id)

  def _RemoveDocumentId(self, doc_id):
    """Removes the doc_id from the set in index."""
    if doc_id in self._document_ids:
      self._document_ids.remove(doc_id)

  @property
  def document_count(self):
    """Returns the number of documents currently in the index."""
    return len(self._document_ids)

  def _AddFieldType(self, name, field_type):
    """Adds the type to the list supported for a named field."""
    self._schema.AddFieldType(name, field_type)

  def GetDocumentStats(self, document):
    """Gets statistics about occurrences of terms in document."""
    document_stats = _DocumentStatistics()
    for field in document.field_list():
      for token in self._tokenizer.TokenizeValue(field_value=field.value()):
        document_stats.IncrementTermCount(token.chars)
    return document_stats

  def AddDocument(self, doc_id, document):
    """Adds a document into the index."""
    token_position = 0
    for field in document.field_list():
      self._AddFieldType(field.name(), field.value().type())
      self._AddTokens(doc_id, field.name(), field.value(), token_position)
    self._AddDocumentId(doc_id)

  def RemoveDocument(self, document):
    """Removes a document from the index."""
    doc_id = document.id()
    for field in document.field_list():
      self._RemoveTokens(doc_id, field.name(), field.value())
    self._RemoveDocumentId(doc_id)

  def _AddTokens(self, doc_id, field_name, field_value, token_position):
    """Adds token occurrences for a given doc's field value."""
    for token in self._tokenizer.TokenizeValue(field_value, token_position):
      # Index both the bare token and the field-restricted form so queries
      # can match globally or per-field.
      self._AddToken(doc_id, token)
      self._AddToken(doc_id, token.RestrictField(field_name))

  def _RemoveTokens(self, doc_id, field_name, field_value):
    """Removes tokens occurrences for a given doc's field value."""
    for token in self._tokenizer.TokenizeValue(field_value=field_value):
      self._RemoveToken(doc_id, token)
      self._RemoveToken(doc_id, token.RestrictField(field_name))

  def _AddToken(self, doc_id, token):
    """Adds a token occurrence for a document."""
    postings = self._inverted_index.get(token)
    if postings is None:
      self._inverted_index[token] = postings = PostingList()
    postings.Add(doc_id, token.position)

  def _RemoveToken(self, doc_id, token):
    """Removes a token occurrence for a document."""
    if token in self._inverted_index:
      postings = self._inverted_index[token]
      postings.Remove(doc_id, token.position)
      # Drop tokens with empty posting lists to keep the index compact.
      if not postings.postings:
        del self._inverted_index[token]

  def GetPostingsForToken(self, token):
    """Returns all document postings which for the token."""
    if token in self._inverted_index:
      return self._inverted_index[token].postings
    return []

  def GetSchema(self):
    """Returns the schema for the index."""
    return self._schema

  def __repr__(self):
    return search_util.Repr(self, [('_inverted_index', self._inverted_index),
                                   ('_schema', self._schema),
                                   ('document_count', self.document_count)])
358 def _ScoreRequested(params
):
359 """Returns True if match scoring requested, False otherwise."""
360 return params
.has_scorer_spec() and params
.scorer_spec().has_scorer()
363 class SimpleIndex(object):
364 """A simple search service which uses a RAM-resident inverted file."""
366 def __init__(self
, index_spec
):
367 self
._index
_spec
= index_spec
369 self
._parser
= simple_tokenizer
.SimpleTokenizer(split_restricts
=False)
370 self
._inverted
_index
= RamInvertedIndex(simple_tokenizer
.SimpleTokenizer())
373 def index_spec(self
):
374 """Returns the index specification for the index."""
375 return self
._index
_spec
377 def IndexDocuments(self
, documents
, response
):
378 """Indexes an iterable DocumentPb.Document."""
379 for document
in documents
:
380 doc_id
= document
.id()
382 doc_id
= str(uuid
.uuid4())
383 document
.set_id(doc_id
)
392 search
._NewDocumentFromPb
(document
)
393 except ValueError, e
:
394 new_status
= response
.add_status()
395 new_status
.set_code(search_service_pb
.SearchServiceError
.INVALID_REQUEST
)
396 new_status
.set_error_detail(e
.message
)
398 response
.add_doc_id(doc_id
)
399 if doc_id
in self
._documents
:
400 old_document
= self
._documents
[doc_id
]
401 self
._inverted
_index
.RemoveDocument(old_document
)
402 self
._documents
[doc_id
] = document
403 new_status
= response
.add_status()
404 new_status
.set_code(search_service_pb
.SearchServiceError
.OK
)
405 self
._inverted
_index
.AddDocument(doc_id
, document
)
407 def DeleteDocuments(self
, document_ids
, response
):
408 """Deletes documents for the given document_ids."""
409 for document_id
in document_ids
:
410 if document_id
in self
._documents
:
411 document
= self
._documents
[document_id
]
412 self
._inverted
_index
.RemoveDocument(document
)
413 del self
._documents
[document_id
]
414 delete_status
= response
.add_status()
415 delete_status
.set_code(search_service_pb
.SearchServiceError
.OK
)
418 """Returns the documents in the index."""
419 return self
._documents
.values()
421 def _TermFrequency(self
, term
, document
):
422 """Return the term frequency in the document."""
423 return self
._inverted
_index
.GetDocumentStats(document
).TermFrequency(term
)
426 def document_count(self
):
427 """Returns the count of documents in the index."""
428 return self
._inverted
_index
.document_count
430 def _DocumentCountForTerm(self
, term
):
431 """Returns the document count for documents containing the term."""
432 return len(self
._PostingsForToken
(tokens
.Token(chars
=term
)))
434 def _InverseDocumentFrequency(self
, term
):
435 """Returns inverse document frequency of term."""
436 doc_count
= self
._DocumentCountForTerm
(term
)
438 return math
.log10(self
.document_count
/ float(doc_count
))
442 def _TermFrequencyInverseDocumentFrequency(self
, term
, document
):
443 """Returns the term frequency times inverse document frequency of term."""
444 return (self
._TermFrequency
(term
, document
) *
445 self
._InverseDocumentFrequency
(term
))
447 def _ScoreDocument(self
, document
, score
, terms
):
448 """Scores a document for the given query."""
453 tf_idf
+= self
._TermFrequencyInverseDocumentFrequency
(term
, document
)
456 def _PostingsForToken(self
, token
):
457 """Returns the postings for the token."""
458 return self
._inverted
_index
.GetPostingsForToken(token
)
460 def _CollectTerms(self
, node
):
461 """Get all search terms for scoring."""
462 if node
.getType() in search_util
.TEXT_QUERY_TYPES
:
463 return set([query_parser
.GetQueryNodeText(node
).strip('"')])
465 if node
.getType() == QueryParser
.EQ
and len(node
.children
) > 1:
466 children
= node
.children
[1:]
468 children
= node
.children
471 for term_set
in (self
._CollectTerms
(child
) for child
in children
):
472 result
.update(term_set
)
476 def _CollectFields(self
, node
):
477 if node
.getType() == QueryParser
.EQ
and node
.children
:
478 return set([query_parser
.GetQueryNodeText(node
.children
[0])])
481 for term_set
in (self
._CollectFields
(child
) for child
in node
.children
):
482 result
.update(term_set
)
486 def _Evaluate(self
, node
, score
=True):
487 """Retrieve scored results for a search query."""
488 doc_match
= document_matcher
.DocumentMatcher(node
, self
._inverted
_index
)
490 matched_documents
= doc_match
.FilterDocuments(self
._documents
.itervalues())
491 terms
= self
._CollectTerms
(node
)
493 _ScoredDocument(doc
, self
._ScoreDocument
(doc
, score
, terms
))
494 for doc
in matched_documents
]
495 return scored_documents
497 def _Sort(self
, docs
, search_params
, score
):
499 return sorted(docs
, key
=lambda doc
: doc
.score
, reverse
=True)
501 if not search_params
.sort_spec_size():
502 return sorted(docs
, key
=lambda doc
: doc
.document
.order_id(), reverse
=True)
504 def SortKey(scored_doc
):
505 """Return the sort key for a document based on the request parameters.
508 scored_doc: The document to score
511 The sort key of a document. The sort key is a tuple, where the nth
512 element in the tuple corresponds to the value of the nth sort expression
513 evaluated on the document.
516 Exception: if no default value is specified.
519 for sort_spec
in search_params
.sort_spec_list():
520 if not (sort_spec
.has_default_value_text() or
521 sort_spec
.has_default_value_numeric()):
522 raise Exception('A default value must be specified for sorting.')
523 elif sort_spec
.has_default_value_text():
524 default_value
= sort_spec
.default_value_text()
526 default_value
= sort_spec
.default_value_numeric()
527 val
= expression_evaluator
.ExpressionEvaluator(
528 scored_doc
, self
._inverted
_index
, True).ValueOf(
529 sort_spec
.sort_expression(), default_value
=default_value
)
530 if isinstance(val
, datetime
.datetime
):
531 val
= search_util
.EpochTime(val
)
532 expr_vals
.append(val
)
533 return tuple(expr_vals
)
536 """The comparison function for sort keys."""
539 for i
, val_tuple
in enumerate(zip(x
, y
)):
540 cmp_val
= cmp(*val_tuple
)
542 if search_params
.sort_spec(i
).sort_descending():
546 return sorted(docs
, key
=SortKey
, cmp=SortCmp
)
548 def _AttachExpressions(self
, docs
, search_params
):
549 if search_params
.has_field_spec():
551 evaluator
= expression_evaluator
.ExpressionEvaluator(
552 doc
, self
._inverted
_index
)
553 for expr
in search_params
.field_spec().expression_list():
554 evaluator
.Evaluate(expr
)
557 def Search(self
, search_request
):
558 """Searches the simple index for ."""
559 query
= urllib
.unquote(search_request
.query())
560 query
= query
.strip()
561 score
= _ScoreRequested(search_request
)
563 docs
= [_ScoredDocument(doc
, 0.0) for doc
in self
._documents
.values()]
565 if not isinstance(query
, unicode):
566 query
= unicode(query
, 'utf-8')
567 query_tree
= query_parser
.ParseAndSimplify(query
)
568 docs
= self
._Evaluate
(query_tree
, score
=score
)
569 docs
= self
._Sort
(docs
, search_request
, score
)
570 docs
= self
._AttachExpressions
(docs
, search_request
)
574 """Returns the schema for the index."""
575 return self
._inverted
_index
.GetSchema()
578 return search_util
.Repr(self
, [('_index_spec', self
._index
_spec
),
579 ('_documents', self
._documents
),
580 ('_inverted_index', self
._inverted
_index
)])
583 class SearchServiceStub(apiproxy_stub
.APIProxyStub
):
584 """Simple RAM backed Search service stub.
586 This stub provides the search_service_pb.SearchService. But this is
587 NOT a subclass of SearchService itself. Services are provided by
588 the methods prefixed by "_Dynamic_".
596 def __init__(self
, service_name
='search', index_file
=None):
600 service_name: Service name expected for all calls.
601 index_file: The file to which search indexes will be persisted.
604 self
.__index
_file
= index_file
605 self
.__index
_file
_lock
= threading
.Lock()
606 super(SearchServiceStub
, self
).__init
__(service_name
)
610 def _InvalidRequest(self
, status
, exception
):
611 status
.set_code(search_service_pb
.SearchServiceError
.INVALID_REQUEST
)
612 status
.set_error_detail(exception
.message
)
614 def _UnknownIndex(self
, status
, index_spec
):
615 status
.set_code(search_service_pb
.SearchServiceError
.OK
)
616 status
.set_error_detail('no index for %r' % index_spec
)
618 def _GetNamespace(self
, namespace
):
619 """Get namespace name.
622 namespace: Namespace provided in request arguments.
625 If namespace is None, returns the name of the current global namespace. If
626 namespace is not None, returns namespace.
628 if namespace
is not None:
630 return namespace_manager
.get_namespace()
632 def _GetIndex(self
, index_spec
, create
=False):
633 namespace
= self
._GetNamespace
(index_spec
.namespace())
635 index
= self
.__indexes
.setdefault(namespace
, {}).get(index_spec
.name())
638 index
= SimpleIndex(index_spec
)
639 self
.__indexes
[namespace
][index_spec
.name()] = index
644 def _Dynamic_IndexDocument(self
, request
, response
):
645 """A local implementation of SearchService.IndexDocument RPC.
647 Index a new document or update an existing document.
650 request: A search_service_pb.IndexDocumentRequest.
651 response: An search_service_pb.IndexDocumentResponse.
653 params
= request
.params()
654 index
= self
._GetIndex
(params
.index_spec(), create
=True)
655 index
.IndexDocuments(params
.document_list(), response
)
657 def _Dynamic_DeleteDocument(self
, request
, response
):
658 """A local implementation of SearchService.DeleteDocument RPC.
661 request: A search_service_pb.DeleteDocumentRequest.
662 response: An search_service_pb.DeleteDocumentResponse.
664 params
= request
.params()
665 index_spec
= params
.index_spec()
666 index
= self
._GetIndex
(index_spec
)
668 self
._UnknownIndex
(response
.add_status(), index_spec
)
670 index
.DeleteDocuments(params
.doc_id_list(), response
)
672 def _Dynamic_ListIndexes(self
, request
, response
):
673 """A local implementation of SearchService.ListIndexes RPC.
676 request: A search_service_pb.ListIndexesRequest.
677 response: An search_service_pb.ListIndexesResponse.
680 ResponseTooLargeError: raised for testing admin console.
685 if request
.has_app_id():
686 if random
.choice([True] + [False] * 9):
687 raise apiproxy_errors
.ResponseTooLargeError()
689 for _
in xrange(random
.randint(0, 2) * random
.randint(5, 15)):
690 new_index_spec
= response
.add_index_metadata().mutable_index_spec()
691 new_index_spec
.set_name(
692 random
.choice(list(_VISIBLE_PRINTABLE_ASCII
- set('!'))) +
693 ''.join(random
.choice(list(_VISIBLE_PRINTABLE_ASCII
))
694 for _
in xrange(random
.randint(
695 0, search
.MAXIMUM_INDEX_NAME_LENGTH
))))
696 response
.mutable_status().set_code(
697 random
.choice([search_service_pb
.SearchServiceError
.OK
] * 10 +
698 [search_service_pb
.SearchServiceError
.TRANSIENT_ERROR
] +
699 [search_service_pb
.SearchServiceError
.INTERNAL_ERROR
]))
702 response
.mutable_status().set_code(
703 search_service_pb
.SearchServiceError
.OK
)
705 namespace
= self
._GetNamespace
(request
.params().namespace())
706 if namespace
not in self
.__indexes
or not self
.__indexes
[namespace
]:
709 keys
, indexes
= zip(*sorted(
710 self
.__indexes
[namespace
].iteritems(), key
=lambda v
: v
[0]))
712 params
= request
.params()
713 if params
.has_start_index_name():
714 position
= bisect
.bisect_left(keys
, params
.start_index_name())
715 if (not params
.include_start_index() and position
< len(keys
)
716 and keys
[position
] == params
.start_index_name()):
718 elif params
.has_index_name_prefix():
719 position
= bisect
.bisect_left(keys
, params
.index_name_prefix())
720 if params
.has_offset():
721 position
+= params
.offset()
722 end_position
= position
+ params
.limit()
723 prefix
= params
.index_name_prefix()
724 for index
in indexes
[min(position
, len(keys
)):min(end_position
, len(keys
))]:
725 index_spec
= index
.index_spec
726 if prefix
and not index_spec
.name().startswith(prefix
):
728 metadata
= response
.add_index_metadata()
729 new_index_spec
= metadata
.mutable_index_spec()
730 new_index_spec
.set_name(index_spec
.name())
731 new_index_spec
.set_namespace(index_spec
.namespace())
732 if params
.fetch_schema():
733 self
._AddSchemaInformation
(index
, metadata
)
735 def _AddSchemaInformation(self
, index
, metadata_pb
):
736 schema
= index
.GetSchema()
738 field_types
= schema
[name
]
739 new_field_types
= metadata_pb
.add_field()
740 new_field_types
.MergeFrom(field_types
)
742 def _AddDocument(self
, response
, document
, ids_only
):
743 doc
= response
.add_document()
745 doc
.set_id(document
.id())
747 doc
.MergeFrom(document
)
749 def _Dynamic_ListDocuments(self
, request
, response
):
750 """A local implementation of SearchService.ListDocuments RPC.
753 request: A search_service_pb.ListDocumentsRequest.
754 response: An search_service_pb.ListDocumentsResponse.
756 params
= request
.params()
757 index
= self
._GetIndex
(params
.index_spec(), create
=True)
759 self
._UnknownIndex
(response
.mutable_status(), params
.index_spec())
763 start
= not params
.has_start_doc_id()
764 for document
in sorted(index
.Documents(), key
=lambda doc
: doc
.id()):
766 if num_docs
< params
.limit():
767 self
._AddDocument
(response
, document
, params
.keys_only())
770 if document
.id() >= params
.start_doc_id():
772 if (document
.id() != params
.start_doc_id() or
773 params
.include_start_doc()):
774 self
._AddDocument
(response
, document
, params
.keys_only())
777 response
.mutable_status().set_code(
778 search_service_pb
.SearchServiceError
.OK
)
780 def _RandomSearchResponse(self
, request
, response
):
783 if random
.random() < 0.03:
784 raise apiproxy_errors
.ResponseTooLargeError()
785 response
.mutable_status().set_code(
786 random
.choice([search_service_pb
.SearchServiceError
.OK
] * 30 +
787 [search_service_pb
.SearchServiceError
.TRANSIENT_ERROR
] +
788 [search_service_pb
.SearchServiceError
.INTERNAL_ERROR
]))
790 params
= request
.params()
791 random
.seed(params
.query())
792 total
= random
.randint(0, 100)
795 if random
.random() < 0.3:
799 if params
.has_offset():
800 offset
= params
.offset()
802 remaining
= max(0, total
- offset
)
803 nresults
= min(remaining
, params
.limit())
804 matched_count
= offset
+ nresults
805 if remaining
> nresults
:
806 matched_count
+= random
.randint(1, 100)
808 def RandomText(charset
, min_len
, max_len
):
809 return ''.join(random
.choice(charset
)
810 for _
in xrange(random
.randint(min_len
, max_len
)))
812 for i
in xrange(nresults
):
813 seed
= '%s:%s' % (params
.query(), i
+ offset
)
815 result
= response
.add_result()
816 doc
= result
.mutable_document()
817 doc_id
= RandomText(string
.letters
+ string
.digits
, 8, 10)
820 for _
in params
.sort_spec_list():
821 result
.add_score(random
.random())
823 for name
, probability
in [('creator', 0.90), ('last_change', 0.40)]:
824 if random
.random() < probability
:
825 field
= doc
.add_field()
827 value
= field
.mutable_value()
828 value
.set_type(document_pb
.FieldValue
.TEXT
)
829 value
.set_string_value(
830 RandomText(string
.letters
+ string
.digits
, 2, 10)
833 field
= doc
.add_field()
834 field
.set_name('content')
835 value
= field
.mutable_value()
836 value
.set_type(document_pb
.FieldValue
.TEXT
)
837 value
.set_string_value(
838 RandomText(string
.printable
, 0, 15) + params
.query() +
839 RandomText(string
.printable
+ 10 * string
.whitespace
, 5, 5000))
841 for i
in xrange(random
.randint(0, 2)):
842 field
= doc
.add_field()
843 field
.set_name(RandomText(string
.letters
, 3, 7))
844 value
= field
.mutable_value()
845 value
.set_type(document_pb
.FieldValue
.TEXT
)
846 value
.set_string_value(RandomText(string
.printable
, 0, 100))
848 response
.set_matched_count(matched_count
)
850 def _DefaultFillSearchResponse(self
, params
, results
, response
):
851 """Fills the SearchResponse with the first set of results."""
852 position_range
= range(0, min(params
.limit(), len(results
)))
853 self
._FillSearchResponse
(results
, position_range
, params
.cursor_type(),
854 _ScoreRequested(params
), response
)
856 def _CopyDocument(self
, doc
, doc_copy
, field_names
, ids_only
=None):
857 """Copies Document, doc, to doc_copy restricting fields to field_names."""
858 doc_copy
.set_id(doc
.id())
861 if doc
.has_language():
862 doc_copy
.set_language(doc
.language())
863 for field
in doc
.field_list():
864 if not field_names
or field
.name() in field_names
:
865 doc_copy
.add_field().CopyFrom(field
)
866 doc_copy
.set_order_id(doc
.order_id())
868 def _FillSearchResponse(self
, results
, position_range
, cursor_type
, score
,
869 response
, field_names
=None, ids_only
=None):
870 """Fills the SearchResponse with a selection of results."""
871 for i
in position_range
:
873 search_result
= response
.add_result()
874 self
._CopyDocument
(result
.document
, search_result
.mutable_document(),
875 field_names
, ids_only
)
876 if cursor_type
== search_service_pb
.SearchParams
.PER_RESULT
:
877 search_result
.set_cursor(self
._EncodeCursor
(result
.document
))
879 search_result
.add_score(result
.score
)
880 for field
, expression
in result
.expressions
.iteritems():
881 expr
= search_result
.add_expression()
883 if (isinstance(expression
, float) or
884 isinstance(expression
, long) or
885 isinstance(expression
, int)):
886 expr
.mutable_value().set_string_value(str(expression
))
887 expr
.mutable_value().set_type(document_pb
.FieldValue
.NUMBER
)
889 expr
.mutable_value().set_string_value(expression
)
890 expr
.mutable_value().set_type(document_pb
.FieldValue
.HTML
)
892 def _Dynamic_Search(self
, request
, response
):
893 """A local implementation of SearchService.Search RPC.
896 request: A search_service_pb.SearchRequest.
897 response: An search_service_pb.SearchResponse.
899 if request
.has_app_id():
900 self
._RandomSearchResponse
(request
, response
)
904 index
= self
._GetIndex
(request
.params().index_spec())
906 self
._UnknownIndex
(response
.mutable_status(),
907 request
.params().index_spec())
908 response
.set_matched_count(0)
911 params
= request
.params()
913 results
= index
.Search(params
)
914 except query_parser
.QueryException
, e
:
915 self
._InvalidRequest
(response
.mutable_status(), e
)
916 response
.set_matched_count(0)
918 except document_matcher
.ExpressionTreeException
, e
:
919 self
._InvalidRequest
(response
.mutable_status(), e
)
920 response
.set_matched_count(0)
922 response
.set_matched_count(len(results
))
925 if params
.has_cursor():
926 doc_id
= self
._DecodeCursor
(params
.cursor())
927 for i
, result
in enumerate(results
):
928 if result
.document
.id() == doc_id
:
931 elif params
.has_offset():
932 offset
= params
.offset()
936 if offset
< len(results
):
939 limit
= offset
+ params
.limit()
940 if limit
>= len(results
):
943 range_end
= len(results
)
949 if params
.cursor_type() == search_service_pb
.SearchParams
.SINGLE
:
950 document
= results
[range_end
- 1].document
951 response
.set_cursor(self
._EncodeCursor
(document
))
952 result_range
= range(offset
, range_end
)
954 result_range
= range(0)
955 field_names
= params
.field_spec().name_list()
956 self
._FillSearchResponse
(results
, result_range
, params
.cursor_type(),
957 _ScoreRequested(params
), response
, field_names
,
960 response
.mutable_status().set_code(search_service_pb
.SearchServiceError
.OK
)
962 def _EncodeCursor(self
, document
):
963 return base64
.urlsafe_b64encode(document
.id())
965 def _DecodeCursor(self
, cursor
):
966 return base64
.urlsafe_b64decode(cursor
)
969 return search_util
.Repr(self
, [('__indexes', self
.__indexes
)])
972 """Write search indexes to the index file.
974 This method is a no-op if index_file is set to None.
976 if not self
.__index
_file
:
983 descriptor
, tmp_filename
= tempfile
.mkstemp(
984 dir=os
.path
.dirname(self
.__index
_file
))
985 tmpfile
= os
.fdopen(descriptor
, 'wb')
987 pickler
= pickle
.Pickler(tmpfile
, protocol
=1)
989 pickler
.dump((self
._VERSION
, self
.__indexes
))
993 self
.__index
_file
_lock
.acquire()
997 os
.rename(tmp_filename
, self
.__index
_file
)
1001 os
.remove(self
.__index
_file
)
1002 os
.rename(tmp_filename
, self
.__index
_file
)
1004 self
.__index
_file
_lock
.release()
1006 def _ReadFromFile(self
):
1007 self
.__index
_file
_lock
.acquire()
1009 if os
.path
.isfile(self
.__index
_file
):
1010 version
, indexes
= pickle
.load(open(self
.__index
_file
, 'rb'))
1011 if version
== self
._VERSION
:
1014 'Saved search indexes are not compatible with this version of the '
1015 'SDK. Search indexes have been cleared.')
1018 'Could not read search indexes from %s', self
.__index
_file
)
1019 except (AttributeError, LookupError, ImportError, NameError, TypeError,
1020 ValueError, pickle
.PickleError
, IOError), e
:
1022 'Could not read indexes from %s. Try running with the '
1023 '--clear_search_index flag. Cause:\n%r' % (self
.__index
_file
, e
))
1025 self
.__index
_file
_lock
.release()
1030 """Read search indexes from the index file.
1032 This method is a no-op if index_file is set to None.
1034 if not self
.__index
_file
:
1036 read_indexes
= self
._ReadFromFile
()
1038 self
.__indexes
= read_indexes