3 # Copyright 2007 Google Inc.
5 # Licensed under the Apache License, Version 2.0 (the "License");
6 # you may not use this file except in compliance with the License.
7 # You may obtain a copy of the License at
9 # http://www.apache.org/licenses/LICENSE-2.0
11 # Unless required by applicable law or agreed to in writing, software
12 # distributed under the License is distributed on an "AS IS" BASIS,
13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 # See the License for the specific language governing permissions and
15 # limitations under the License.
21 """A Python Search API used by app developers.
23 Contains methods used to interface with Search API.
24 Contains API classes that forward to apiproxy.
39 from google
.appengine
.datastore
import document_pb
40 from google
.appengine
.api
import apiproxy_stub_map
41 from google
.appengine
.api
import datastore_types
42 from google
.appengine
.api
import namespace_manager
43 from google
.appengine
.api
.search
import expression_parser
44 from google
.appengine
.api
.search
import query_parser
45 from google
.appengine
.api
.search
import search_service_pb
46 from google
.appengine
.api
.search
import search_util
47 from google
.appengine
.datastore
import datastore_rpc
48 from google
.appengine
.runtime
import apiproxy_errors
53 'ConcurrentTransactionError',
59 'DOCUMENT_ID_FIELD_NAME',
72 'LANGUAGE_FIELD_NAME',
74 'MAXIMUM_DOCUMENT_ID_LENGTH',
75 'MAXIMUM_DOCUMENTS_PER_PUT_REQUEST',
76 'MAXIMUM_DOCUMENTS_RETURNED_PER_SEARCH',
77 'MAXIMUM_EXPRESSION_LENGTH',
78 'MAXIMUM_FIELD_ATOM_LENGTH',
79 'MAXIMUM_FIELD_NAME_LENGTH',
80 'MAXIMUM_FIELD_VALUE_LENGTH',
81 'MAXIMUM_FIELDS_RETURNED_PER_SEARCH',
82 'MAXIMUM_GET_INDEXES_OFFSET',
83 'MAXIMUM_INDEX_NAME_LENGTH',
84 'MAXIMUM_INDEXES_RETURNED_PER_GET_REQUEST',
85 'MAXIMUM_NUMBER_FOUND_ACCURACY',
86 'MAXIMUM_QUERY_LENGTH',
87 'MAXIMUM_SEARCH_OFFSET',
88 'MAXIMUM_SORTED_DOCUMENTS',
101 'RescoringMatchScorer',
109 'TIMESTAMP_FIELD_NAME',
# Limits on the sizes of names, values, expressions and queries.
MAXIMUM_INDEX_NAME_LENGTH = 100
MAXIMUM_FIELD_VALUE_LENGTH = 1024 * 1024
MAXIMUM_FIELD_ATOM_LENGTH = 500
MAXIMUM_FIELD_NAME_LENGTH = 500
MAXIMUM_DOCUMENT_ID_LENGTH = 500
MAXIMUM_DOCUMENTS_PER_PUT_REQUEST = 200
MAXIMUM_EXPRESSION_LENGTH = 5000
MAXIMUM_QUERY_LENGTH = 2000

# Limits that apply to search requests and their results.
MAXIMUM_DOCUMENTS_RETURNED_PER_SEARCH = 1000
MAXIMUM_SEARCH_OFFSET = 1000
MAXIMUM_SORTED_DOCUMENTS = 10000
MAXIMUM_NUMBER_FOUND_ACCURACY = 10000
MAXIMUM_FIELDS_RETURNED_PER_SEARCH = 100

# Limits that apply to get_indexes requests.
MAXIMUM_INDEXES_RETURNED_PER_GET_REQUEST = 1000
MAXIMUM_GET_INDEXES_OFFSET = 1000

# Special field names; all of them start with an underscore.
DOCUMENT_ID_FIELD_NAME = '_doc_id'
LANGUAGE_FIELD_NAME = '_lang'
RANK_FIELD_NAME = '_rank'
SCORE_FIELD_NAME = '_score'
TIMESTAMP_FIELD_NAME = '_timestamp'
# Accepts any two characters, optionally followed by '_' and two more.
_LANGUAGE_RE = re.compile('^(.{2}|.{2}_.{2})$')

# Default maximum length used by _ValidateString, and the cursor limit.
_MAXIMUM_STRING_LENGTH = 500
_MAXIMUM_CURSOR_LENGTH = 10000

# Every printable ASCII character except the whitespace ones.
_VISIBLE_PRINTABLE_ASCII = (
    frozenset(string.printable) - frozenset(string.whitespace))
_FIELD_NAME_PATTERN = '^[A-Za-z][A-Za-z0-9_]*$'

# Inclusive range of naive datetimes accepted by _CheckDate.
MAX_DATE = datetime.datetime(
    year=datetime.MAXYEAR, month=12, day=31,
    hour=23, minute=59, second=59, microsecond=999999, tzinfo=None)
MIN_DATE = datetime.datetime(
    year=datetime.MINYEAR, month=1, day=1,
    hour=0, minute=0, second=0, microsecond=0, tzinfo=None)

# Inclusive range of values accepted for number fields.
MAX_NUMBER_VALUE = 2147483647
MIN_NUMBER_VALUE = -2147483647
165 _PROTO_FIELDS_STRING_VALUE
= frozenset([document_pb
.FieldValue
.TEXT
,
166 document_pb
.FieldValue
.HTML
,
167 document_pb
.FieldValue
.ATOM
])
class Error(Exception):
  """Base class for failures of calls on the search API."""


class InternalError(Error):
  """A search API call failed on the internal backend."""


class TransientError(Error):
  """A search API call failed, but retrying it may succeed."""


class InvalidRequest(Error):
  """The client made an invalid request on the search API."""


class QueryError(Error):
  """A query input string could not be parsed."""


class ExpressionError(Error):
  """An expression input string could not be parsed."""


class Timeout(Error):
  """A search API call could not finish before its deadline."""


class ConcurrentTransactionError(Error):
  """A search API call failed because of concurrent updates."""
def _ConvertToUnicode(some_string):
  """Returns some_string as unicode, decoding it as UTF-8 when necessary.

  Args:
    some_string: None, a unicode object, or a UTF-8 encoded string.

  Returns:
    None if some_string is None, some_string itself if it already is
    unicode, otherwise the result of decoding it as UTF-8.
  """
  if some_string is None or isinstance(some_string, unicode):
    return some_string
  return unicode(some_string, 'utf-8')
def _ConcatenateErrorMessages(prefix, status):
  """Builds an error message out of prefix and status.error_detail().

  Args:
    prefix: The leading part of the message.
    status: An object whose error_detail() returns the detail text.

  Returns:
    'prefix: detail' when status carries a non-empty error detail,
    otherwise prefix alone.
  """
  if not status.error_detail():
    return prefix
  return prefix + ': ' + status.error_detail()
218 class OperationResult(object):
219 """Represents result of individual operation of a batch index or removal.
221 This is an abstract class.
224 (OK
, INVALID_REQUEST
, TRANSIENT_ERROR
, INTERNAL_ERROR
,
225 TIMEOUT
, CONCURRENT_TRANSACTION
) = (
226 'OK', 'INVALID_REQUEST', 'TRANSIENT_ERROR', 'INTERNAL_ERROR',
227 'TIMEOUT', 'CONCURRENT_TRANSACTION')
229 _CODES
= frozenset([OK
, INVALID_REQUEST
, TRANSIENT_ERROR
, INTERNAL_ERROR
,
230 TIMEOUT
, CONCURRENT_TRANSACTION
])
232 def __init__(self
, code
, message
=None, id=None):
236 code: The error or success code of the operation.
237 message: An error message associated with any error.
238 id: The id of the object some operation was performed on.
241 TypeError: If an unknown attribute is passed.
242 ValueError: If an unknown code is passed.
244 self
._message
= _ConvertToUnicode(message
)
246 if self
._code
not in self
._CODES
:
247 raise ValueError('Unknown operation result code %r, must be one of %s'
248 % (self
._code
, self
._CODES
))
249 self
._id
= _ConvertToUnicode(id)
253 """Returns the code indicating the status of the operation."""
258 """Returns any associated error message if the operation was in error."""
263 """Returns the Id of the object the operation was performed on."""
267 return _Repr(self
, [('code', self
.code
), ('message', self
.message
),
271 _ERROR_OPERATION_CODE_MAP
= {
272 search_service_pb
.SearchServiceError
.OK
: OperationResult
.OK
,
273 search_service_pb
.SearchServiceError
.INVALID_REQUEST
:
274 OperationResult
.INVALID_REQUEST
,
275 search_service_pb
.SearchServiceError
.TRANSIENT_ERROR
:
276 OperationResult
.TRANSIENT_ERROR
,
277 search_service_pb
.SearchServiceError
.INTERNAL_ERROR
:
278 OperationResult
.INTERNAL_ERROR
,
279 search_service_pb
.SearchServiceError
.TIMEOUT
:
280 OperationResult
.TIMEOUT
,
281 search_service_pb
.SearchServiceError
.CONCURRENT_TRANSACTION
:
282 OperationResult
.CONCURRENT_TRANSACTION
,
286 class PutResult(OperationResult
):
287 """The result of indexing a single object."""
290 class DeleteResult(OperationResult
):
291 """The result of deleting a single document."""
class PutError(Error):
  """Raised when indexing one of the requested objects failed."""

  def __init__(self, message, results):
    """Initializer.

    Args:
      message: A message detailing why some object could not be indexed.
      results: A list of PutResult corresponding to the list of objects
        requested to be indexed.
    """
    super(PutError, self).__init__(message)
    self._results = results

  @property
  def results(self):
    """Returns the PutResult list corresponding to the objects indexed."""
    return self._results
class DeleteError(Error):
  """Raised when deleting one of the requested objects failed."""

  def __init__(self, message, results):
    """Initializer.

    Args:
      message: A message detailing why some object could not be deleted.
      results: A list of DeleteResult corresponding to the list of Ids of
        objects requested to be deleted.
    """
    super(DeleteError, self).__init__(message)
    self._results = results

  @property
  def results(self):
    """Returns the DeleteResult list corresponding to Documents deleted."""
    return self._results
337 search_service_pb
.SearchServiceError
.INVALID_REQUEST
: InvalidRequest
,
338 search_service_pb
.SearchServiceError
.TRANSIENT_ERROR
: TransientError
,
339 search_service_pb
.SearchServiceError
.INTERNAL_ERROR
: InternalError
,
340 search_service_pb
.SearchServiceError
.TIMEOUT
: Timeout
,
341 search_service_pb
.SearchServiceError
.CONCURRENT_TRANSACTION
:
342 ConcurrentTransactionError
,
def _ToSearchError(error):
  """Maps an application error onto the matching search Error, if any.

  Args:
    error: An ApplicationError to translate.

  Returns:
    An instance of the Error subclass registered in _ERROR_MAP for
    error.application_error, built from error.error_detail; when the code
    is not registered, the given error itself.
  """
  error_class = _ERROR_MAP.get(error.application_error)
  if error_class is None:
    return error
  return error_class(error.error_detail)
def _CheckInteger(value, name, zero_ok=True, upper_bound=None):
  """Validates that value is an integer no larger than upper_bound.

  Args:
    value: The value to check.
    name: The name of the value, to use in error messages.
    zero_ok: True if zero is allowed.
    upper_bound: The upper (inclusive) bound of the value. Optional.

  Returns:
    The checked value.

  Raises:
    ValueError: If the value is not a int or long, or is out of range.
  """
  # Type and zero checks are delegated to the datastore validator.
  datastore_types.ValidateInteger(
      value, name, ValueError, empty_ok=True, zero_ok=zero_ok)
  too_large = upper_bound is not None and value > upper_bound
  if too_large:
    raise ValueError('%s, %d must be <= %d' % (name, value, upper_bound))
  return value
def _CheckEnum(value, name, values=None):
  """Validates that value is one of the allowed values.

  Args:
    value: The value to check.
    name: The name of the value, to use in error messages.
    values: The iterable of possible values.

  Returns:
    The checked value.

  Raises:
    ValueError: If the value is not one of the allowable values.
  """
  if value in values:
    return value
  raise ValueError('%s, %r must be in %s' % (name, value, values))
def _CheckNumber(value, name):
  """Validates that value is an int, long or float.

  Args:
    value: The value to check.
    name: The name of the value, to use in error messages.

  Returns:
    The checked value.

  Raises:
    TypeError: If the value is not a number.
  """
  if isinstance(value, (int, long, float)):
    return value
  type_name = value.__class__.__name__
  raise TypeError('%s must be a int, long or float, got %s' %
                  (name, type_name))
def _CheckStatus(status):
  """Raises the search Error that matches a RequestStatus which is not OK.

  Args:
    status: The RequestStatus to check.

  Raises:
    Error: A subclass of Error if the value of status is not OK. The
      subclass is looked up in _ERROR_MAP by the status code.
    InternalError: If the status value is unknown.
  """
  code = status.code()
  if code == search_service_pb.SearchServiceError.OK:
    return
  # Codes without an _ERROR_MAP entry are reported as InternalError.
  error_class = _ERROR_MAP.get(code, InternalError)
  raise error_class(status.error_detail())
439 def _ValidateString(value
,
441 max_len
=_MAXIMUM_STRING_LENGTH
,
443 type_exception
=TypeError,
444 value_exception
=ValueError):
445 """Raises an exception if value is not a valid string or a subclass thereof.
447 A string is valid if it's not empty, no more than _MAXIMUM_STRING_LENGTH
448 bytes. The exception type can be specified with the exception
449 arguments for type and value issues.
452 value: The value to validate.
453 name: The name of this value; used in the exception message.
454 max_len: The maximum allowed length, in bytes.
455 empty_ok: Allow empty value.
456 type_exception: The type of exception to raise if not a basestring.
457 value_exception: The type of exception to raise if invalid value.
463 TypeError: If value is not a basestring or subclass.
464 ValueError: If the value is None or longer than max_len.
466 if value
is None and empty_ok
:
468 if value
is not None and not isinstance(value
, basestring
):
469 raise type_exception('%s must be a basestring; got %s:' %
470 (name
, value
.__class
__.__name
__))
471 if not value
and not empty_ok
:
472 raise value_exception('%s must not be empty.' % name
)
474 if len(value
.encode('utf-8')) > max_len
:
475 raise value_exception('%s must be under %d bytes.' % (name
, max_len
))
def _ValidateVisiblePrintableAsciiNotReserved(value, name):
  """Validates value is visible printable ASCII that does not start with '!'.

  Whitespace characters are not allowed. Visible printable ASCII strings
  starting with '!' are reserved for internal use.

  Args:
    value: The string to validate.
    name: The name of this string; used in the exception message.

  Returns:
    The checked string.

  Raises:
    ValueError: If the string is not visible printable ASCII, or starts with
      '!'.
  """
  if any(char not in _VISIBLE_PRINTABLE_ASCII for char in value):
    raise ValueError(
        '%r must be visible printable ASCII: %r' % (name, value))
  if value.startswith('!'):
    raise ValueError('%r must not start with "!": %r' % (name, value))
  return value
def _CheckIndexName(index_name):
  """Validates an index name and returns it.

  Index names must be non-empty strings of at most
  MAXIMUM_INDEX_NAME_LENGTH bytes, made of visible printable ASCII, and
  must not start with '!'.

  Args:
    index_name: The index name to check.

  Returns:
    The result of _ValidateVisiblePrintableAsciiNotReserved for index_name.
  """
  _ValidateString(index_name, 'index name', MAXIMUM_INDEX_NAME_LENGTH)
  checked_name = _ValidateVisiblePrintableAsciiNotReserved(
      index_name, 'index_name')
  return checked_name
def _CheckFieldName(name):
  """Validates a field name and returns it.

  A field name must pass _ValidateString with a maximum length of
  MAXIMUM_FIELD_NAME_LENGTH and match "[A-Za-z][A-Za-z0-9_]*".

  Args:
    name: The field name to check.

  Returns:
    The checked field name.

  Raises:
    ValueError: If the name does not match the field name pattern.
  """
  _ValidateString(name, 'name', MAXIMUM_FIELD_NAME_LENGTH)
  if re.match(_FIELD_NAME_PATTERN, name) is None:
    raise ValueError('field name "%s" should match pattern: %s' %
                     (name, _FIELD_NAME_PATTERN))
  return name
def _CheckExpression(expression):
  """Validates that expression is a parseable expression string.

  The expression is first checked with _ValidateString against
  MAXIMUM_EXPRESSION_LENGTH and then handed to expression_parser.Parse.

  Args:
    expression: The expression string to check.

  Returns:
    The value returned by _ValidateString for the expression.

  Raises:
    ExpressionError: If the expression parser rejects the expression.
  """
  expression = _ValidateString(expression, max_len=MAXIMUM_EXPRESSION_LENGTH)
  try:
    expression_parser.Parse(expression)
  except expression_parser.ExpressionException:
    # The parser's own message is dropped on purpose; callers only see the
    # offending expression text.
    raise ExpressionError('Failed to parse expression "%s"' % expression)
  return expression
def _CheckFieldNames(names):
  """Validates every entry of names with _CheckFieldName.

  Args:
    names: An iterable of field names.

  Returns:
    The names object that was passed in.
  """
  for field_name in names:
    _CheckFieldName(field_name)
  return names
544 def _GetList(a_list
):
545 """Utility function that converts None to the empty list."""
552 def _ConvertToList(arg
):
553 """Converts arg to a list, empty if None, single element if not a list."""
554 if isinstance(arg
, basestring
):
558 return list(iter(arg
))
def _ConvertToUnicodeList(arg):
  """Turns arg into a list whose elements went through _ConvertToUnicode.

  Args:
    arg: Anything accepted by _ConvertToList.

  Returns:
    A new list with _ConvertToUnicode applied to each element of
    _ConvertToList(arg).
  """
  converted = []
  for value in _ConvertToList(arg):
    converted.append(_ConvertToUnicode(value))
  return converted
def _CheckDocumentId(doc_id):
  """Validates a document identifier and returns it.

  Document ids must be non-empty strings of at most
  MAXIMUM_DOCUMENT_ID_LENGTH bytes, made of visible printable ASCII, and
  must not start with '!'.

  Args:
    doc_id: The document identifier to check.

  Returns:
    The checked document identifier.
  """
  label = 'doc_id'
  _ValidateString(doc_id, label, MAXIMUM_DOCUMENT_ID_LENGTH)
  _ValidateVisiblePrintableAsciiNotReserved(doc_id, label)
  return doc_id
def _CheckText(value, name='value', empty_ok=True):
  """Validates the text of a field with _ValidateString.

  Args:
    value: The text to check.
    name: The name of the value; used in the exception message.
    empty_ok: Whether an empty value is allowed.

  Returns:
    The result of _ValidateString, called with a maximum length of
    MAXIMUM_FIELD_VALUE_LENGTH.
  """
  return _ValidateString(
      value, name, max_len=MAXIMUM_FIELD_VALUE_LENGTH, empty_ok=empty_ok)
584 def _CheckHtml(html
):
585 """Checks the field html is a valid HTML string."""
586 return _ValidateString(html
, 'html', MAXIMUM_FIELD_VALUE_LENGTH
,
590 def _CheckAtom(atom
):
591 """Checks the field atom is a valid string."""
592 return _ValidateString(atom
, 'atom', MAXIMUM_FIELD_ATOM_LENGTH
,
def _CheckDate(date):
  """Validates that date is a date or datetime inside the supported range.

  Args:
    date: A datetime.datetime or datetime.date.

  Returns:
    The checked date.

  Raises:
    TypeError: If date is neither a datetime.datetime nor a datetime.date,
      or if it lies outside [MIN_DATE, MAX_DATE].
  """
  # datetime.datetime is a subclass of datetime.date, so test it first.
  if isinstance(date, datetime.datetime):
    lower, upper = MIN_DATE, MAX_DATE
  elif isinstance(date, datetime.date):
    lower, upper = MIN_DATE.date(), MAX_DATE.date()
  else:
    raise TypeError('date must be datetime.datetime or datetime.date')
  if date < lower or date > upper:
    # The message always quotes the datetime bounds, for dates as well.
    raise TypeError('date must be between %s and %s (got %s)' %
                    (MIN_DATE, MAX_DATE, date))
  return date
def _CheckLanguage(language):
  """Validates that language is None or a string matching _LANGUAGE_RE.

  Args:
    language: The language code to check, or None.

  Returns:
    The checked language, or None if language is None.

  Raises:
    TypeError: If language is neither None nor a basestring.
    ValueError: If language does not match _LANGUAGE_RE.
  """
  if language is None:
    return None
  if not isinstance(language, basestring):
    raise TypeError('language must be a basestring, got %s' %
                    language.__class__.__name__)
  if _LANGUAGE_RE.match(language) is None:
    raise ValueError('invalid language %s. Languages should be two letters.'
                     % language)
  return language
def _CheckDocument(document):
  """Checks that the document is valid.

  This checks for all server-side requirements on Documents. Currently, that
  means ensuring that there are no repeated number or date fields.

  Args:
    document: The search.Document to check for validity.

  Raises:
    ValueError: If the document is invalid in a way that would trigger a
      PutError from the server.
  """
  # Number and date field names are tracked separately, so a number field
  # and a date field may share a name.
  seen_by_type = ((NumberField, set()), (DateField, set()))
  for field in document.fields:
    for field_type, seen_names in seen_by_type:
      if not isinstance(field, field_type):
        continue
      if field.name in seen_names:
        raise ValueError(
            'Invalid document %s: field %s with type date or number may not '
            'be repeated.' % (document.doc_id, field.name))
      seen_names.add(field.name)
      break
def _CheckSortLimit(limit):
  """Validates the limit on the number of documents to score or sort.

  Args:
    limit: The limit to check.

  Returns:
    The result of _CheckInteger with an upper bound of
    MAXIMUM_SORTED_DOCUMENTS.
  """
  checked_limit = _CheckInteger(
      limit, 'limit', upper_bound=MAXIMUM_SORTED_DOCUMENTS)
  return checked_limit
def _Repr(class_instance, ordered_dictionary):
  """Builds the 'search.ClassName(key=value, ...)' text for an instance.

  Args:
    class_instance: The object whose class name is shown.
    ordered_dictionary: An iterable of (key, value) pairs, in display order.

  Returns:
    A unicode string listing every pair whose value is neither None nor
    equal to the empty list.
  """
  shown = []
  for key, value in ordered_dictionary:
    if value is None:
      continue
    if value != []:
      shown.append('%s=%r' % (key, value))
  class_name = class_instance.__class__.__name__
  return u'search.%s(%s)' % (class_name, ', '.join(shown))
def _ListIndexesResponsePbToGetResponse(response):
  """Builds a GetResponse out of a get_indexes response pb.

  Args:
    response: The response protocol buffer of a ListIndexes call.

  Returns:
    A GetResponse whose results are the indexes built with _NewIndexFromPb
    from each entry of response.index_metadata_list().
  """
  indexes = []
  for index_metadata in response.index_metadata_list():
    indexes.append(_NewIndexFromPb(index_metadata))
  return GetResponse(results=indexes)
672 @datastore_rpc._positional
(7)
673 def get_indexes(namespace
='', offset
=None, limit
=20,
674 start_index_name
=None, include_start_index
=True,
675 index_name_prefix
=None, fetch_schema
=False, deadline
=None,
677 """Returns a list of available indexes.
680 namespace: The namespace of indexes to be returned. If not set
681 then the current namespace is used.
682 offset: The offset of the first returned index.
683 limit: The number of indexes to return.
684 start_index_name: The name of the first index to be returned.
685 include_start_index: Whether or not to return the start index.
686 index_name_prefix: The prefix used to select returned indexes.
687 fetch_schema: Whether to retrieve Schema for each Index or not.
690 deadline: Deadline for RPC call in seconds; if None use the default.
693 The GetResponse containing a list of available indexes.
696 InternalError: If the request fails on internal servers.
697 TypeError: If any of the parameters have invalid types, or an unknown
699 ValueError: If any of the parameters have invalid values (e.g., a
703 app_id
= kwargs
.pop('app_id', None)
705 raise TypeError('Invalid arguments: %s' % ', '.join(kwargs
))
707 response
= _GetIndexes(
708 namespace
=namespace
, offset
=offset
, limit
=limit
,
709 start_index_name
=start_index_name
,
710 include_start_index
=include_start_index
,
711 index_name_prefix
=index_name_prefix
,
712 fetch_schema
=fetch_schema
, deadline
=deadline
, app_id
=app_id
)
713 return _ListIndexesResponsePbToGetResponse(response
)
716 def _GetIndexes(namespace
='', offset
=None, limit
=20,
717 start_index_name
=None, include_start_index
=True,
718 index_name_prefix
=None, fetch_schema
=False, deadline
=None,
720 """Returns a ListIndexesResponse."""
722 request
= search_service_pb
.ListIndexesRequest()
723 params
= request
.mutable_params()
725 if namespace
is None:
726 namespace
= namespace_manager
.get_namespace()
727 if namespace
is None:
729 namespace_manager
.validate_namespace(namespace
, exception
=ValueError)
730 params
.set_namespace(namespace
)
731 if offset
is not None:
732 params
.set_offset(_CheckInteger(offset
, 'offset', zero_ok
=True,
733 upper_bound
=MAXIMUM_GET_INDEXES_OFFSET
))
734 params
.set_limit(_CheckInteger(
735 limit
, 'limit', zero_ok
=False,
736 upper_bound
=MAXIMUM_INDEXES_RETURNED_PER_GET_REQUEST
))
737 if start_index_name
is not None:
738 params
.set_start_index_name(
739 _ValidateString(start_index_name
, 'start_index_name',
740 MAXIMUM_INDEX_NAME_LENGTH
,
742 if include_start_index
is not None:
743 params
.set_include_start_index(bool(include_start_index
))
744 if index_name_prefix
is not None:
745 params
.set_index_name_prefix(
746 _ValidateString(index_name_prefix
, 'index_name_prefix',
747 MAXIMUM_INDEX_NAME_LENGTH
,
749 params
.set_fetch_schema(fetch_schema
)
751 response
= search_service_pb
.ListIndexesResponse()
753 request
.set_app_id(app_id
)
755 _MakeSyncSearchServiceCall('ListIndexes', request
, response
, deadline
)
757 _CheckStatus(response
.status())
761 """An abstract base class which represents a field of a document.
763 This class should not be directly instantiated.
767 TEXT
, HTML
, ATOM
, DATE
, NUMBER
, GEO_POINT
= ('TEXT', 'HTML', 'ATOM', 'DATE',
768 'NUMBER', 'GEO_POINT')
770 _FIELD_TYPES
= frozenset([TEXT
, HTML
, ATOM
, DATE
, NUMBER
, GEO_POINT
])
772 def __init__(self
, name
, value
, language
=None):
776 name: The name of the field. Field names must have maximum length
777 MAXIMUM_FIELD_NAME_LENGTH and match pattern "[A-Za-z][A-Za-z0-9_]*".
778 value: The value of the field which can be a str, unicode or date.
779 language: The ISO 693-1 two letter code of the language used in the value.
780 See http://www.sil.org/iso639-3/codes.asp?order=639_1&letter=%25 for a
781 list of valid codes. Correct specification of language code will assist
782 in correct tokenization of the field. If None is given, then the
783 language code of the document will be used.
786 TypeError: If any of the parameters have invalid types, or an unknown
788 ValueError: If any of the parameters have invalid values.
790 self
._name
= _CheckFieldName(_ConvertToUnicode(name
))
791 self
._value
= self
._CheckValue
(value
)
792 self
._language
= _CheckLanguage(_ConvertToUnicode(language
))
796 """Returns the name of the field."""
801 """Returns the code of the language the content in value is written in."""
802 return self
._language
806 """Returns the value of the field."""
809 def _CheckValue(self
, value
):
810 """Checks the value is valid for the given type.
813 value: The value to check.
818 raise NotImplementedError('_CheckValue is an abstract method')
821 return _Repr(self
, [('name', self
.name
), ('language', self
.language
),
822 ('value', self
.value
)])
824 def __eq__(self
, other
):
825 return isinstance(other
, type(self
)) and self
.__key
() == other
.__key
()
827 def __ne__(self
, other
):
828 return not self
== other
831 return (self
.name
, self
.value
, self
.language
)
834 return hash(self
.__key
())
839 def _CopyStringValueToProtocolBuffer(self
, field_value_pb
):
840 """Copies value to a string value in proto buf."""
841 field_value_pb
.set_string_value(self
.value
.encode('utf-8'))
844 def _CopyFieldToProtocolBuffer(field
, pb
):
845 """Copies field's contents to a document_pb.Field protocol buffer."""
846 pb
.set_name(field
.name
.encode('utf-8'))
847 field_value_pb
= pb
.mutable_value()
849 field_value_pb
.set_language(field
.language
.encode('utf-8'))
850 if field
.value
is not None:
851 field
._CopyValueToProtocolBuffer
(field_value_pb
)
class TextField(Field):
  """A Field that has text content.

  The following example shows a text field named signature with Polish content:
    TextField(name='signature', value='brzydka pogoda', language='pl')
  """

  def __init__(self, name, value=None, language=None):
    """Initializer.

    Args:
      name: The name of the field.
      value: A str or unicode object containing text.
      language: The code of the language the value is encoded in.

    Raises:
      TypeError: If value is not a string.
      ValueError: If value is longer than allowed.
    """
    text = _ConvertToUnicode(value)
    Field.__init__(self, name, text, language)

  def _CheckValue(self, value):
    """Validates the value with _CheckText and returns the result."""
    return _CheckText(value)

  def _CopyValueToProtocolBuffer(self, field_value_pb):
    """Marks field_value_pb as TEXT and copies the string value into it."""
    field_value_pb.set_type(document_pb.FieldValue.TEXT)
    self._CopyStringValueToProtocolBuffer(field_value_pb)
class HtmlField(Field):
  """A Field that has HTML content.

  The following example shows an html field named content:
    HtmlField(name='content', value='<html>herbata, kawa</html>', language='pl')
  """

  def __init__(self, name, value=None, language=None):
    """Initializer.

    Args:
      name: The name of the field.
      value: A str or unicode object containing the searchable content of
        the field.
      language: The code of the language the value is encoded in.

    Raises:
      TypeError: If value is not a string.
      ValueError: If value is longer than allowed.
    """
    html = _ConvertToUnicode(value)
    Field.__init__(self, name, html, language)

  def _CheckValue(self, value):
    """Validates the value with _CheckHtml and returns the result."""
    return _CheckHtml(value)

  def _CopyValueToProtocolBuffer(self, field_value_pb):
    """Marks field_value_pb as HTML and copies the string value into it."""
    field_value_pb.set_type(document_pb.FieldValue.HTML)
    self._CopyStringValueToProtocolBuffer(field_value_pb)
class AtomField(Field):
  """A Field that has content to be treated as a single token for indexing.

  The following example shows an atom field named contributor:
    AtomField(name='contributor', value='foo@bar.com')
  """

  def __init__(self, name, value=None, language=None):
    """Initializer.

    Args:
      name: The name of the field.
      value: A str or unicode object to be treated as an indivisible text
        value.
      language: The code of the language the value is encoded in.

    Raises:
      TypeError: If value is not a string.
      ValueError: If value is longer than allowed.
    """
    atom = _ConvertToUnicode(value)
    Field.__init__(self, name, atom, language)

  def _CheckValue(self, value):
    """Validates the value with _CheckAtom and returns the result."""
    return _CheckAtom(value)

  def _CopyValueToProtocolBuffer(self, field_value_pb):
    """Marks field_value_pb as ATOM and copies the string value into it."""
    field_value_pb.set_type(document_pb.FieldValue.ATOM)
    self._CopyStringValueToProtocolBuffer(field_value_pb)
class DateField(Field):
  """A Field that has a date or datetime value.

  The following example shows a date field named creation_date:
    DateField(name='creation_date', value=datetime.date(2011, 3, 11))
  """

  def __init__(self, name, value=None):
    """Initializer.

    Args:
      name: The name of the field.
      value: A datetime.date or a datetime.datetime.

    Raises:
      TypeError: If value is not a datetime.date or a datetime.datetime.
    """
    Field.__init__(self, name, value)

  def _CheckValue(self, value):
    """Validates the value with _CheckDate and returns the result."""
    return _CheckDate(value)

  def _CopyValueToProtocolBuffer(self, field_value_pb):
    """Marks field_value_pb as DATE and stores the serialized date in it."""
    field_value_pb.set_type(document_pb.FieldValue.DATE)
    serialized = search_util.SerializeDate(self.value)
    field_value_pb.set_string_value(serialized)
class NumberField(Field):
  """A Field that has a numeric value.

  The following example shows a number field named size:
    NumberField(name='size', value=10)
  """

  def __init__(self, name, value=None):
    """Initializer.

    Args:
      name: The name of the field.
      value: A numeric value.

    Raises:
      TypeError: If value is not numeric.
      ValueError: If value is out of range.
    """
    Field.__init__(self, name, value)

  def _CheckValue(self, value):
    """Validates that value is a number within the supported range.

    Args:
      value: The value to check.

    Returns:
      The checked value.

    Raises:
      TypeError: If value is not an int, long or float.
      ValueError: If value is below MIN_NUMBER_VALUE or above
        MAX_NUMBER_VALUE.
    """
    value = _CheckNumber(value, 'field value')
    if value is not None and (value < MIN_NUMBER_VALUE or
                              value > MAX_NUMBER_VALUE):
      # The value is formatted with %s rather than %d: %d cannot format an
      # infinite float and would replace the ValueError with an
      # OverflowError.
      raise ValueError('value, %s must be between %d and %d' %
                       (value, MIN_NUMBER_VALUE, MAX_NUMBER_VALUE))
    return value

  def _CopyValueToProtocolBuffer(self, field_value_pb):
    """Marks field_value_pb as NUMBER and stores str(value) in it."""
    field_value_pb.set_type(document_pb.FieldValue.NUMBER)
    field_value_pb.set_string_value(str(self.value))
class GeoPoint(object):
  """Represents a point on the Earth's surface, in lat, long coordinates."""

  def __init__(self, latitude, longitude):
    """Initializer.

    Args:
      latitude: The angle between the equatorial plan and a line that passes
        through the GeoPoint, between -90 and 90 degrees.
      longitude: The angle east or west from a reference meridian to another
        meridian that passes through the GeoPoint, between -180 and 180
        degrees.

    Raises:
      TypeError: If any of the parameters have invalid types, or an unknown
        attribute is passed.
      ValueError: If any of the parameters have invalid values.
    """
    self._latitude = self._CheckLatitude(latitude)
    self._longitude = self._CheckLongitude(longitude)

  @property
  def latitude(self):
    """Returns the angle between equatorial plan and line thru the geo point."""
    return self._latitude

  @property
  def longitude(self):
    """Returns the angle from a reference meridian to another meridian."""
    return self._longitude

  def _CheckLatitude(self, value):
    """Returns value if it is a number between -90 and 90 inclusive."""
    _CheckNumber(value, 'latitude')
    if value < -90.0 or value > 90.0:
      raise ValueError('latitude must be between -90 and 90 degrees '
                       'inclusive, was %f' % value)
    return value

  def _CheckLongitude(self, value):
    """Returns value if it is a number between -180 and 180 inclusive."""
    _CheckNumber(value, 'longitude')
    if value < -180.0 or value > 180.0:
      raise ValueError('longitude must be between -180 and 180 degrees '
                       'inclusive, was %f' % value)
    return value

  def __eq__(self, other):
    # Objects that are not GeoPoints compare unequal instead of failing
    # with an AttributeError on the missing latitude attribute.
    return (isinstance(other, GeoPoint) and
            self.latitude == other.latitude and
            self.longitude == other.longitude)

  def __ne__(self, other):
    # Python 2 does not derive != from ==, so define it explicitly.
    return not self == other

  def __hash__(self):
    # Kept consistent with __eq__: equal points hash alike.
    return hash((self.latitude, self.longitude))

  def __repr__(self):
    return _Repr(self,
                 [('latitude', self.latitude),
                  ('longitude', self.longitude)])
def _CheckGeoPoint(geo_point):
  """Validates that geo_point is a GeoPoint and returns it.

  Args:
    geo_point: The value to check.

  Returns:
    The checked GeoPoint.

  Raises:
    TypeError: If geo_point is not a GeoPoint.
  """
  if isinstance(geo_point, GeoPoint):
    return geo_point
  raise TypeError('geo_point must be a GeoPoint, got %s' %
                  geo_point.__class__.__name__)
class GeoField(Field):
  """A Field that has a GeoPoint value.

  The following example shows a geo field named place:

    GeoField(name='place', value=GeoPoint(latitude=-33.84, longitude=151.26))
  """

  def __init__(self, name, value=None):
    """Initializer.

    Args:
      name: The name of the field.
      value: A GeoPoint value.

    Raises:
      TypeError: If value is not a GeoPoint.
    """
    Field.__init__(self, name, value)

  def _CheckValue(self, value):
    """Validates the value with _CheckGeoPoint and returns the result."""
    return _CheckGeoPoint(value)

  def _CopyValueToProtocolBuffer(self, field_value_pb):
    """Marks field_value_pb as GEO and copies latitude and longitude."""
    field_value_pb.set_type(document_pb.FieldValue.GEO)
    point = self.value
    geo_pb = field_value_pb.mutable_geo()
    geo_pb.set_lat(point.latitude)
    geo_pb.set_lng(point.longitude)
def _GetValue(value_pb):
  """Extracts the Python value stored in a FieldValue protocol buffer.

  Args:
    value_pb: The FieldValue protocol buffer to read.

  Returns:
    The raw string for TEXT, HTML and ATOM values, the deserialized date
    for DATE values, a float for NUMBER values and a GeoPoint for GEO
    values; None when the protocol buffer holds no value of its type.

  Raises:
    TypeError: If the type of value_pb is none of the above.
  """
  value_type = value_pb.type()
  if value_type in _PROTO_FIELDS_STRING_VALUE:
    return value_pb.string_value() if value_pb.has_string_value() else None
  if value_type == document_pb.FieldValue.DATE:
    if not value_pb.has_string_value():
      return None
    return search_util.DeserializeDate(value_pb.string_value())
  if value_type == document_pb.FieldValue.NUMBER:
    if not value_pb.has_string_value():
      return None
    return float(value_pb.string_value())
  if value_type == document_pb.FieldValue.GEO:
    if not value_pb.has_geo():
      return None
    geo_pb = value_pb.geo()
    return GeoPoint(latitude=geo_pb.lat(), longitude=geo_pb.lng())
  raise TypeError('unknown FieldValue type %d' % value_type)
1117 _STRING_TYPES
= set([document_pb
.FieldValue
.TEXT
,
1118 document_pb
.FieldValue
.HTML
,
1119 document_pb
.FieldValue
.ATOM
])
def _DecodeUTF8(pb_value):
  """Decodes a UTF-8 encoded string into unicode.

  Args:
    pb_value: A UTF-8 encoded string, or None.

  Returns:
    The decoded unicode string, or None if pb_value is None.
  """
  return None if pb_value is None else pb_value.decode('utf-8')
def _DecodeValue(pb_value, val_type):
  """Decodes pb_value to unicode when val_type is a string type.

  Args:
    pb_value: The value read from a protocol buffer.
    val_type: The FieldValue type of the value.

  Returns:
    _DecodeUTF8(pb_value) if val_type is one of _STRING_TYPES, otherwise
    pb_value unchanged.
  """
  return _DecodeUTF8(pb_value) if val_type in _STRING_TYPES else pb_value
def _NewFieldFromPb(pb):
  """Constructs a Field from a document_pb.Field protocol buffer.

  Args:
    pb: The document_pb.Field protocol buffer to convert.

  Returns:
    A TextField, HtmlField, AtomField, DateField, NumberField or GeoField,
    depending on the type of the value in pb.

  Raises:
    InvalidRequest: If the type of the value is not one of the known types.
  """
  name = _DecodeUTF8(pb.name())
  val_type = pb.value().type()
  value = _DecodeValue(_GetValue(pb.value()), val_type)
  lang = None
  if pb.value().has_language():
    lang = _DecodeUTF8(pb.value().language())
  if val_type == document_pb.FieldValue.TEXT:
    return TextField(name, value, lang)
  elif val_type == document_pb.FieldValue.HTML:
    return HtmlField(name, value, lang)
  elif val_type == document_pb.FieldValue.ATOM:
    return AtomField(name, value, lang)
  elif val_type == document_pb.FieldValue.DATE:
    return DateField(name, value)
  elif val_type == document_pb.FieldValue.NUMBER:
    return NumberField(name, value)
  elif val_type == document_pb.FieldValue.GEO:
    return GeoField(name, value)
  # The error is raised, not returned: handing the exception object back
  # would make callers treat it as a Field.
  raise InvalidRequest('Unknown field value type %d' % val_type)
class Document(object):
  """Represents a user generated document.

  The following example shows how to create a document consisting of a set
  of fields, some plain text and some in HTML.

  Document(doc_id='document_id',
           fields=[TextField(name='subject', value='going for dinner'),
                   HtmlField(name='body',
                             value='<html>I found a place.</html>'),
                   TextField(name='signature', value='brzydka pogoda',
                             language='pl')],
           language='en')
  """

  # Reference instant from which default ranks are measured.
  _FIRST_JAN_2011 = datetime.datetime(2011, 1, 1)

  def __init__(self, doc_id=None, fields=None, language='en', rank=None):
    """Initializer.

    Args:
      doc_id: The visible printable ASCII string identifying the document
        which does not start with '!'. Whitespace is excluded from ids. If
        no id is provided, the search service will provide one.
      fields: An iterable of Field instances representing the content of the
        document.
      language: The code of the language used in the field values.
      rank: The rank of this document used to specify the order in which
        documents are returned by search. Rank must be a non-negative
        integer. If not specified, the number of seconds since 1st Jan 2011
        is used. Documents are returned in descending order of their rank,
        in absence of sorting or scoring options.

    Raises:
      TypeError: If any of the parameters have invalid types, or an unknown
        attribute is passed.
      ValueError: If any of the parameters have invalid values.
    """
    doc_id = _ConvertToUnicode(doc_id)
    if doc_id is not None:
      _CheckDocumentId(doc_id)
    self._doc_id = doc_id
    self._fields = _GetList(fields)
    self._language = _CheckLanguage(_ConvertToUnicode(language))

    # Built lazily by _BuildFieldMap() on the first lookup by field name.
    self._field_map = None

    if rank is None:
      rank = self._GetDefaultRank()
    self._rank = self._CheckRank(rank)

    _CheckDocument(self)

  @property
  def doc_id(self):
    """Returns the document identifier."""
    return self._doc_id

  @property
  def fields(self):
    """Returns a list of fields of the document."""
    return self._fields

  @property
  def language(self):
    """Returns the code of the language the document fields are written in."""
    return self._language

  @property
  def rank(self):
    """Returns the rank of this document."""
    return self._rank

  def field(self, field_name):
    """Returns the field with the provided field name.

    Args:
      field_name: The name of the field to return.

    Returns:
      A field with the given name.

    Raises:
      ValueError: There is not exactly one field with the given name.
    """
    fields = self[field_name]
    if len(fields) == 1:
      return fields[0]
    raise ValueError(
        'Must have exactly one field with name %s, but found %d.' %
        (field_name, len(fields)))

  def __getitem__(self, field_name):
    """Returns a list of all fields with the provided field name.

    Args:
      field_name: The name of the field to return.

    Returns:
      All fields with the given name, or an empty list if no field with that
      name exists.
    """
    return self._BuildFieldMap().get(field_name, [])

  def __iter__(self):
    """Documents do not support iteration.

    This is provided to raise an explicit exception; without it, defining
    __getitem__ would make Python attempt index-based iteration.
    """
    raise TypeError('Documents do not support iteration.')

  def _BuildFieldMap(self):
    """Lazily builds and returns the map of field name to list of fields."""
    if self._field_map is None:
      field_map = {}
      for field in self._fields:
        field_map.setdefault(field.name, []).append(field)
      self._field_map = field_map
    return self._field_map

  def _CheckRank(self, rank):
    """Checks if rank is valid, then returns it."""
    return _CheckInteger(rank, 'rank', upper_bound=sys.maxint)

  def _GetDefaultRank(self):
    """Returns a default rank as total seconds since 1st Jan 2011."""
    elapsed = datetime.datetime.now() - Document._FIRST_JAN_2011
    # Whole seconds only; microseconds are deliberately ignored.
    return elapsed.seconds + (elapsed.days * 24 * 3600)

  def __repr__(self):
    return _Repr(
        self, [('doc_id', self.doc_id), ('fields', self.fields),
               ('language', self.language), ('rank', self.rank)])

  def __eq__(self, other):
    return (isinstance(other, type(self)) and self.doc_id == other.doc_id and
            self.rank == other.rank and self.language == other.language
            and self.fields == other.fields)

  def __ne__(self, other):
    return not self == other

  def __key(self):
    # Equal documents always share a doc_id, so hashing on it alone keeps
    # __hash__ consistent with __eq__.
    return self.doc_id

  def __hash__(self):
    return hash(self.__key())

  def __str__(self):
    return repr(self)
def _CopyDocumentToProtocolBuffer(document, pb):
  """Copies Document to a document_pb.Document protocol buffer.

  Args:
    document: The Document to copy from.
    pb: The document_pb.Document to populate.

  Returns:
    The populated pb.
  """
  pb.set_storage(document_pb.Document.DISK)
  # A Document may be created without an id (doc_id defaults to None), so
  # only write the id when one is present.
  if document.doc_id:
    pb.set_id(document.doc_id.encode('utf-8'))
  if document.language:
    pb.set_language(document.language.encode('utf-8'))
  for field in document.fields:
    field_pb = pb.add_field()
    _CopyFieldToProtocolBuffer(field, field_pb)
  # The document rank is stored in the protocol buffer's order_id.
  pb.set_order_id(document.rank)
  return pb
def _NewFieldsFromPb(field_list):
  """Returns a list of Field copied from a document_pb.Document proto buf.

  Args:
    field_list: An iterable of document_pb.Field protocol buffers.

  Returns:
    A list with one Field per entry of field_list, in the same order.
  """
  converted = []
  for field_pb in field_list:
    converted.append(_NewFieldFromPb(field_pb))
  return converted
def _NewDocumentFromPb(doc_pb):
  """Constructs a Document from a document_pb.Document protocol buffer.

  Args:
    doc_pb: The document_pb.Document to convert.

  Returns:
    A Document carrying the id, fields, language and rank (order_id) stored
    in doc_pb.
  """
  # The language is optional in the protocol buffer; None records that the
  # stored document carried no language.
  lang = None
  if doc_pb.has_language():
    lang = _DecodeUTF8(doc_pb.language())
  return Document(doc_id=_DecodeUTF8(doc_pb.id()),
                  fields=_NewFieldsFromPb(doc_pb.field_list()),
                  language=lang,
                  rank=doc_pb.order_id())
1341 def _QuoteString(argument
):
1342 return '"' + argument
.replace('"', '\\\"') + '"'
class FieldExpression(object):
  """Represents an expression that will be computed for each result returned.

  For example,
    FieldExpression(name='content_snippet',
                    expression='snippet("very important", content)')
  means a computed field 'content_snippet' will be returned with each search
  result, which contains HTML snippets of the 'content' field which match
  the query 'very important'.
  """

  MAXIMUM_EXPRESSION_LENGTH = 1000
  MAXIMUM_OPERATOR_LENGTH = 100

  def __init__(self, name, expression):
    """Initializer.

    Args:
      name: The name of the computed field for the expression.
      expression: The expression to evaluate and return in a field with
        given name in results. See
        https://developers.google.com/appengine/docs/python/search/overview#Expressions
        for a list of legal expressions.

    Raises:
      TypeError: If any of the parameters has an invalid type, or an unknown
        attribute is passed.
      ValueError: If any of the parameters has an invalid value.
      ExpressionError: If the expression string is not parseable.
    """
    self._name = _CheckFieldName(_ConvertToUnicode(name))
    # The expression is a string; the messages say so rather than naming
    # this class, which is what was confusingly reported before.
    if expression is None:
      raise ValueError('expression must be a string, got None')
    if not isinstance(expression, basestring):
      raise TypeError('expression must be a string, got %s' %
                      expression.__class__.__name__)
    self._expression = _CheckExpression(_ConvertToUnicode(expression))

  @property
  def name(self):
    """Returns name of the expression to return in search results."""
    return self._name

  @property
  def expression(self):
    """Returns a string containing an expression returned in search results."""
    return self._expression

  def __repr__(self):
    return _Repr(
        self, [('name', self.name), ('expression', self.expression)])
def _CopyFieldExpressionToProtocolBuffer(field_expression, pb):
  """Copies FieldExpression to a search_service_pb.FieldSpec_Expression.

  Args:
    field_expression: The FieldExpression supplying name and expression.
    pb: The protocol buffer that receives both, UTF-8 encoded.
  """
  encoded_name = field_expression.name.encode('utf-8')
  pb.set_name(encoded_name)
  encoded_expression = field_expression.expression.encode('utf-8')
  pb.set_expression(encoded_expression)
class SortOptions(object):
  """Represents a multi-dimensional sort of Documents.

  The following code shows how to sort documents based on product rating
  in descending order and then cheapest product within similarly rated
  products, sorting at most 1000 documents:

    SortOptions(expressions=[
        SortExpression(expression='rating',
            direction=SortExpression.DESCENDING, default_value=0),
        SortExpression(expression='price + tax',
            direction=SortExpression.ASCENDING, default_value=999999.99)],
        limit=1000)
  """

  def __init__(self, expressions=None, match_scorer=None, limit=1000):
    """Initializer.

    Args:
      expressions: An iterable of SortExpression representing a
        multi-dimensional sort of Documents.
      match_scorer: A match scorer specification which may be used to
        score documents or in a SortExpression combined with other features.
      limit: The limit on the number of documents to score or sort.

    Raises:
      TypeError: If any of the parameters has an invalid type, or an unknown
        attribute is passed.
      ValueError: If any of the parameters has an invalid value.
    """
    self._match_scorer = match_scorer
    self._expressions = _GetList(expressions)
    for expression in self._expressions:
      if not isinstance(expression, SortExpression):
        raise TypeError('expression must be a SortExpression, got %s' %
                        expression.__class__.__name__)
    self._limit = _CheckSortLimit(limit)

  @property
  def expressions(self):
    """A list of SortExpression specifying a multi-dimensional sort."""
    return self._expressions

  @property
  def match_scorer(self):
    """Returns a match scorer to score documents with."""
    return self._match_scorer

  @property
  def limit(self):
    """Returns the limit on the number of documents to score or sort."""
    return self._limit

  def __repr__(self):
    return _Repr(
        self, [('match_scorer', self.match_scorer),
               ('expressions', self.expressions),
               ('limit', self.limit)])
class MatchScorer(object):
  """Assigns a document score based on term frequency.

  If you add a MatchScorer to a SortOptions as in the following code:

    sort_opts = search.SortOptions(match_scorer=search.MatchScorer())

  then, this will sort the documents in descending score order. The scores
  will be positive. If you want to sort in ascending order, then use the
  following code:

    sort_opts = search.SortOptions(match_scorer=search.MatchScorer(),
        expressions=[search.SortExpression(
            expression='_score', direction=search.SortExpression.ASCENDING,
            default_value=0.0)])

  The scores in this case will be negative.
  """

  def __init__(self):
    """Initializer.

    Takes no parameters; the scorer carries no configuration of its own.

    Raises:
      TypeError: If any of the parameters has an invalid type, or an unknown
        attribute is passed.
      ValueError: If any of the parameters has an invalid value.
    """

  def __repr__(self):
    return _Repr(self, [])
class RescoringMatchScorer(MatchScorer):
  """Assigns a document score based on term frequency weighted by doc parts.

  If you add a RescoringMatchScorer to a SortOptions as in the following code:

    sort_opts = search.SortOptions(match_scorer=search.RescoringMatchScorer())

  then, this will sort the documents in descending score order. The scores
  will be positive. If you want to sort in ascending order, then use the
  following code:

    sort_opts = search.SortOptions(match_scorer=search.RescoringMatchScorer(),
        expressions=[search.SortExpression(
            expression='_score', direction=search.SortExpression.ASCENDING,
            default_value=0.0)])

  The scores in this case will be negative.
  """

  def __init__(self):
    """Initializer.

    Raises:
      TypeError: If any of the parameters has an invalid type, or an unknown
        attribute is passed.
      ValueError: If any of the parameters has an invalid value.
    """
    super(RescoringMatchScorer, self).__init__()
def _CopySortExpressionToProtocolBuffer(sort_expression, pb):
  """Copies a SortExpression to a search_service_pb.SortSpec protocol buffer.

  Args:
    sort_expression: The SortExpression to copy from.
    pb: The search_service_pb.SortSpec to populate.

  Returns:
    The populated pb.
  """
  pb.set_sort_expression(sort_expression.expression.encode('utf-8'))
  # Only the ascending case is written explicitly.
  # NOTE(review): presumably descending is the protocol buffer default --
  # confirm against the SortSpec definition.
  if sort_expression.direction == SortExpression.ASCENDING:
    pb.set_sort_descending(False)

  default_value = sort_expression.default_value
  if isinstance(default_value, basestring):
    pb.set_default_value_text(default_value.encode('utf-8'))
  elif isinstance(default_value, datetime.date):
    # datetime.datetime is a subclass of datetime.date, so this single check
    # covers both date and datetime defaults.
    pb.set_default_value_numeric(search_util.EpochTime(default_value))
  else:
    # Plain numbers; must not run for the text or date cases above.
    pb.set_default_value_numeric(default_value)
  return pb
def _CopyMatchScorerToScorerSpecProtocolBuffer(match_scorer, limit, pb):
  """Copies a MatchScorer to a search_service_pb.ScorerSpec.

  Args:
    match_scorer: A MatchScorer or RescoringMatchScorer instance.
    limit: The limit on the number of documents to score.
    pb: The search_service_pb.ScorerSpec to populate.

  Returns:
    The populated pb.

  Raises:
    TypeError: If match_scorer is neither a MatchScorer nor a
      RescoringMatchScorer.
  """
  # RescoringMatchScorer subclasses MatchScorer, so it must be tested first.
  if isinstance(match_scorer, RescoringMatchScorer):
    pb.set_scorer(search_service_pb.ScorerSpec.RESCORING_MATCH_SCORER)
  elif isinstance(match_scorer, MatchScorer):
    pb.set_scorer(search_service_pb.ScorerSpec.MATCH_SCORER)
  else:
    raise TypeError(
        'match_scorer must be a MatchScorer or RescoringMatchScorer, '
        'got %s' % match_scorer.__class__.__name__)
  pb.set_limit(limit)
  return pb
def _CopySortOptionsToProtocolBuffer(sort_options, params):
  """Copies the SortOptions into the SearchParams proto buf.

  Args:
    sort_options: The SortOptions to copy from.
    params: The SearchParams protocol buffer that receives one sort spec per
      sort expression, plus a scorer spec carrying the sort limit.
  """
  for sort_expression in sort_options.expressions:
    _CopySortExpressionToProtocolBuffer(sort_expression,
                                        params.add_sort_spec())

  # The limit is recorded on the scorer spec whether or not a scorer is set.
  spec = params.mutable_scorer_spec()
  scorer = sort_options.match_scorer
  if scorer:
    _CopyMatchScorerToScorerSpecProtocolBuffer(
        scorer, sort_options.limit, spec)
  spec.set_limit(sort_options.limit)
class SortExpression(object):
  """Sort by a user specified scoring expression.

  For example, the following will sort documents on a numeric field named
  'length' in ascending order, assigning a default value of sys.maxint for
  documents which do not specify a 'length' field.

    SortExpression(expression='length',
                   direction=SortExpression.ASCENDING,
                   default_value=sys.maxint)

  The following example will sort documents on a date field named
  'published_date' in descending order, assigning a default value of
  1999-12-31 for documents which do not specify a 'published_date' field.

    SortExpression(expression='published_date',
                   default_value=datetime.date(year=1999, month=12, day=31))

  The following example will sort documents on a text field named 'subject'
  in descending order, assigning a default value of '' for documents which
  do not specify a 'subject' field.

    SortExpression(expression='subject')
  """

  # unichr(0x10ffff) raises ValueError on narrow Unicode builds of Python 2;
  # fall back to the largest code point such builds can represent.
  try:
    MAX_FIELD_VALUE = unichr(0x10ffff) * 80
  except ValueError:
    MAX_FIELD_VALUE = unichr(0xffff) * 80

  MIN_FIELD_VALUE = u''

  # The two accepted sort directions.
  ASCENDING, DESCENDING = ('ASCENDING', 'DESCENDING')

  _DIRECTIONS = frozenset([ASCENDING, DESCENDING])

  def __init__(self, expression, direction=DESCENDING, default_value=''):
    """Initializer.

    Args:
      expression: An expression to be evaluated on each matching document
        to sort by. The expression must evaluate to a text or numeric value.
        The expression can simply be a field name, or some compound expression
        such as "_score + count(likes) * 0.1" which will add the score from a
        scorer to a count of the values of a likes field times 0.1. See
        https://developers.google.com/appengine/docs/python/search/overview#Expressions
        for a list of legal expressions.
      direction: The direction to sort the search results, either ASCENDING
        or DESCENDING.
      default_value: The default value of the expression. The default_value is
        returned if expression cannot be calculated, for example, if the
        expression is a field name and no value for that named field exists.
        A text value must be specified for text sorts. A numeric value must be
        specified for numeric sorts. A date value must be specified for date
        sorts.

    Raises:
      TypeError: If any of the parameters has an invalid type, or an unknown
        attribute is passed.
      ValueError: If any of the parameters has an invalid value.
      ExpressionError: If the expression string is not parseable.
    """
    self._expression = _ConvertToUnicode(expression)
    self._direction = self._CheckDirection(direction)
    if self._expression is None:
      # The expression is a string; say so instead of naming this class.
      raise TypeError('expression must be a string, got None')
    _CheckExpression(self._expression)

    if isinstance(default_value, basestring):
      default_value = _ConvertToUnicode(default_value)
      _CheckText(default_value, 'default_value')
    elif not isinstance(default_value,
                        (int, long, float, datetime.date, datetime.datetime)):
      raise TypeError('default_value must be text, numeric or datetime, got %s'
                      % default_value.__class__.__name__)
    self._default_value = default_value

  @property
  def expression(self):
    """Returns the expression to sort by."""
    return self._expression

  @property
  def direction(self):
    """Returns the direction to sort expression: ASCENDING or DESCENDING."""
    return self._direction

  @property
  def default_value(self):
    """Returns a default value for the expression if no value computed."""
    return self._default_value

  def _CheckDirection(self, direction):
    """Checks direction is a valid SortExpression direction and returns it."""
    return _CheckEnum(direction, 'direction', values=self._DIRECTIONS)

  def __repr__(self):
    return _Repr(
        self, [('expression', self.expression),
               ('direction', self.direction),
               ('default_value', self.default_value)])
class ScoredDocument(Document):
  """Represents a scored document returned from a search."""

  def __init__(self, doc_id=None, fields=None, language='en',
               sort_scores=None, expressions=None, cursor=None, rank=None):
    """Initializer.

    Args:
      doc_id: The visible printable ASCII string identifying the document
        which does not start with '!'. Whitespace is excluded from ids. If
        no id is provided, the search service will provide one.
      fields: An iterable of Field instances representing the content of the
        document.
      language: The code of the language used in the field values.
      sort_scores: The list of scores assigned during sort evaluation. Each
        sort dimension is included. Positive scores are used for ascending
        sorts; negative scores for descending.
      expressions: The list of computed fields which are the result of
        expressions requested.
      cursor: A cursor associated with the document.
      rank: The rank of this document. A rank must be a non-negative integer
        less than sys.maxint. If not specified, the number of seconds since
        1st Jan 2011 is used. Documents are returned in descending order of
        their rank.

    Raises:
      TypeError: If any of the parameters have invalid types, or an unknown
        attribute is passed.
      ValueError: If any of the parameters have invalid values.
    """
    super(ScoredDocument, self).__init__(doc_id=doc_id, fields=fields,
                                         language=language, rank=rank)
    self._sort_scores = self._CheckSortScores(_GetList(sort_scores))
    self._expressions = _GetList(expressions)
    if cursor is not None and not isinstance(cursor, Cursor):
      raise TypeError('cursor must be a Cursor, got %s' %
                      cursor.__class__.__name__)
    self._cursor = cursor

  @property
  def sort_scores(self):
    """The list of scores assigned during sort evaluation.

    Each sort dimension is included. Positive scores are used for ascending
    sorts; negative scores for descending.

    Returns:
      The list of numeric sort scores.
    """
    return self._sort_scores

  @property
  def expressions(self):
    """The list of computed fields the result of expression evaluation.

    For example, if a request has
      FieldExpression(name='snippet', 'snippet("good story", content)')
    meaning to compute a snippet field containing HTML snippets extracted
    from the matching of the query 'good story' on the field 'content'.
    This means a field such as the following will be returned in expressions
    for the search result:
      HtmlField(name='snippet', value='that was a <b>good story</b> to finish')

    Returns:
      The computed fields.
    """
    return self._expressions

  @property
  def cursor(self):
    """A cursor associated with a result, a continued search starting point.

    Returns:
      The Cursor passed to the initializer, or None if none was supplied.
    """
    return self._cursor

  def _CheckSortScores(self, sort_scores):
    """Checks sort_scores is a list of floats, and returns it."""
    for sort_score in sort_scores:
      _CheckNumber(sort_score, 'sort_scores')
    # Return the validated list; __init__ stores whatever this returns.
    return sort_scores

  def __repr__(self):
    return _Repr(self, [('doc_id', self.doc_id),
                        ('fields', self.fields),
                        ('language', self.language),
                        ('rank', self.rank),
                        ('sort_scores', self.sort_scores),
                        ('expressions', self.expressions),
                        ('cursor', self.cursor)])
class SearchResults(object):
  """Represents the result of executing a search request."""

  def __init__(self, number_found, results=None, cursor=None):
    """Initializer.

    Args:
      number_found: The number of documents found for the query.
      results: The list of ScoredDocuments returned from executing a
        search request.
      cursor: A Cursor to continue the search from the end of the
        search results.

    Raises:
      TypeError: If any of the parameters have an invalid type, or an unknown
        attribute is passed.
      ValueError: If any of the parameters have an invalid value.
    """
    self._number_found = _CheckInteger(number_found, 'number_found')
    self._results = _GetList(results)
    if cursor is not None and not isinstance(cursor, Cursor):
      raise TypeError('cursor must be a Cursor, got %s' %
                      cursor.__class__.__name__)
    self._cursor = cursor

  def __iter__(self):
    """Yields each ScoredDocument in results, in order."""
    for result in self.results:
      yield result

  @property
  def results(self):
    """Returns the list of ScoredDocuments that matched the query."""
    return self._results

  @property
  def number_found(self):
    """Returns the number of documents which were found for the search.

    Note that this is an approximation and not an exact count.
    If QueryOptions.number_found_accuracy parameter is set to 100
    for example, then number_found <= 100 is accurate.

    Returns:
      The number of documents found.
    """
    return self._number_found

  @property
  def cursor(self):
    """Returns a cursor that can be used to continue search from last result.

    Returns:
      The Cursor passed to the initializer, or None if none was supplied.
    """
    return self._cursor

  def __repr__(self):
    return _Repr(self, [('results', self.results),
                        ('number_found', self.number_found),
                        ('cursor', self.cursor)])
class GetResponse(object):
  """Represents the result of executing a get request.

  For example, the following code shows how a response could be used
  to walk over the documents returned from an index.

    response = index.get_range()
    for document in response:
      print "document ", document
  """

  def __init__(self, results=None):
    """Initializer.

    Args:
      results: The results returned from an index ordered by Id.

    Raises:
      TypeError: If any of the parameters have an invalid type, or an unknown
        attribute is passed.
      ValueError: If any of the parameters have an invalid value.
    """
    self._results = _GetList(results)

  def __iter__(self):
    """Yields each result, in order."""
    for result in self.results:
      yield result

  @property
  def results(self):
    """Returns a list of results ordered by Id from the index."""
    return self._results

  def __repr__(self):
    return _Repr(self, [('results', self.results)])
class Cursor(object):
  """Specifies how to get the next page of results in a search.

  A cursor returned in a previous set of search results to use as a starting
  point to retrieve the next set of results. This can get you better
  performance, and also improves the consistency of pagination through index
  updates.

  The following shows how to use the cursor to get the next page of results:

    # get the first set of results; the first cursor is used to specify
    # that cursors are to be returned in the SearchResults.
    results = index.search(Query(query_string='some stuff',
        options=QueryOptions(cursor=Cursor())))

    # get the next set of results
    results = index.search(Query(query_string='some stuff',
        options=QueryOptions(cursor=results.cursor)))

  If you want to continue search from any one of the ScoredDocuments in
  SearchResults, then you can set Cursor.per_result to True.

    # get the first set of results; the first cursor is used to specify
    # that cursors are to be returned in the SearchResults.
    results = index.search(Query(query_string='some stuff',
        options=QueryOptions(cursor=Cursor(per_result=True))))

    # this shows how to access the per_document cursors returned from a search
    per_document_cursor = None
    for scored_document in results:
      per_document_cursor = scored_document.cursor

    # get the next set of results
    results = index.search(Query(query_string='some stuff',
        options=QueryOptions(cursor=per_document_cursor)))
  """

  def __init__(self, web_safe_string=None, per_result=False):
    """Initializer.

    Args:
      web_safe_string: The cursor string returned from the search service to
        be interpreted by the search service to get the next set of results.
      per_result: A bool when true will return a cursor per ScoredDocument in
        SearchResults, otherwise will return a single cursor for the whole
        SearchResults. If using offset this is ignored, as the user is
        responsible for calculating a next offset if any.

    Raises:
      ValueError: if the web_safe_string is not of required format.
    """
    self._web_safe_string = _CheckCursor(_ConvertToUnicode(web_safe_string))
    self._per_result = per_result
    # Always defined, so readers never hit AttributeError on a cursor that
    # was created without a web safe string.
    self._internal_cursor = None
    if self._web_safe_string:
      # Expected layout is '<True|False>:<internal cursor>'; see
      # _ToWebSafeString for the producing side.
      parts = self._web_safe_string.split(':', 1)
      if len(parts) != 2 or parts[0] not in ['True', 'False']:
        raise ValueError('invalid format for web_safe_string, got %s' %
                         self._web_safe_string)
      self._internal_cursor = parts[1]
      # The flag embedded in the string overrides the per_result argument.
      self._per_result = (parts[0] == 'True')

  @property
  def web_safe_string(self):
    """Returns the cursor string generated by the search service."""
    return self._web_safe_string

  @property
  def per_result(self):
    """Returns whether to return a cursor for each ScoredDocument in results."""
    return self._per_result

  def __repr__(self):
    return _Repr(self, [('web_safe_string', self.web_safe_string)])
1952 def _ToWebSafeString(per_result
, internal_cursor
):
1953 """Returns the web safe string combining per_result with internal cursor."""
1954 return str(per_result
) + ':' + internal_cursor
def _CheckQuery(query):
  """Checks a query is a valid query string.

  Args:
    query: The query string to validate.

  Returns:
    The query, unchanged.

  Raises:
    TypeError: If query is None.
    QueryError: If a non-blank query cannot be parsed.
  """
  _ValidateString(query, 'query', MAXIMUM_QUERY_LENGTH, empty_ok=True)
  if query is None:
    raise TypeError('query must be unicode, got None')
  # Empty queries are allowed (empty_ok above), so only parse when there is
  # something other than whitespace.
  if query.strip():
    try:
      query_parser.Parse(query)
    except query_parser.QueryException:
      raise QueryError('Failed to parse query "%s"' % query)
  return query
def _CheckLimit(limit):
  """Checks the limit of documents to return is an integer within range.

  Zero is rejected and the upper bound is
  MAXIMUM_DOCUMENTS_RETURNED_PER_SEARCH; returns whatever _CheckInteger
  returns for the limit.
  """
  checked_limit = _CheckInteger(
      limit, 'limit',
      zero_ok=False,
      upper_bound=MAXIMUM_DOCUMENTS_RETURNED_PER_SEARCH)
  return checked_limit
def _CheckOffset(offset):
  """Checks the offset in document list is an integer within range.

  Zero is accepted and the upper bound is MAXIMUM_SEARCH_OFFSET; returns
  whatever _CheckInteger returns for the offset.
  """
  checked_offset = _CheckInteger(
      offset, 'offset',
      zero_ok=True,
      upper_bound=MAXIMUM_SEARCH_OFFSET)
  return checked_offset
def _CheckNumberFoundAccuracy(number_found_accuracy):
  """Checks the accuracy is an integer within range.

  Zero is rejected and the upper bound is MAXIMUM_NUMBER_FOUND_ACCURACY;
  returns whatever _CheckInteger returns for the accuracy.
  """
  checked_accuracy = _CheckInteger(
      number_found_accuracy, 'number_found_accuracy',
      zero_ok=False,
      upper_bound=MAXIMUM_NUMBER_FOUND_ACCURACY)
  return checked_accuracy
def _CheckCursor(cursor):
  """Checks the cursor if specified is a string which is not too long.

  Args:
    cursor: The cursor string to validate, or None.

  Returns:
    Whatever _ValidateString returns for the cursor.
  """
  # A cursor is optional (Cursor() defaults its string to None), so an
  # empty value has to be accepted.
  return _ValidateString(cursor, 'cursor', _MAXIMUM_CURSOR_LENGTH,
                         empty_ok=True)
def _CheckNumberOfFields(returned_expressions, snippeted_fields,
                         returned_fields):
  """Checks the count of all field kinds is less than limit.

  Args:
    returned_expressions: The expressions requested for return.
    snippeted_fields: The names of the fields to snippet.
    returned_fields: The names of the fields to return.

  Raises:
    ValueError: If the three collections together hold more than
      MAXIMUM_FIELDS_RETURNED_PER_SEARCH entries.
  """
  number_expressions = (len(returned_expressions) + len(snippeted_fields) +
                        len(returned_fields))
  if number_expressions > MAXIMUM_FIELDS_RETURNED_PER_SEARCH:
    raise ValueError(
        'too many fields, snippets or expressions to return %d > maximum %d'
        % (number_expressions, MAXIMUM_FIELDS_RETURNED_PER_SEARCH))
2008 class QueryOptions(object):
2009 """Options for post-processing results for a query.
2011 Options include the ability to sort results, control which document fields
2012 to return, produce snippets of fields and compute and sort by complex
2013 scoring expressions.
2015 If you wish to randomly access pages of search results, you can use an
2018 # get the first set of results
2020 results = index.search(Query(query_string='some stuff',
2021 QueryOptions(limit=page_size))
2024 pages = results.found_count / page_size
2026 # user chooses page and hence an offset into results
2027 next_page = ith * page_size
2029 # get the search results for that page
2030 results = index.search(Query(query_string='some stuff',
2031 QueryOptions(limit=page_size, offset=next_page))
2034 def __init__(self
, limit
=20, number_found_accuracy
=None, cursor
=None,
2035 offset
=None, sort_options
=None, returned_fields
=None,
2036 ids_only
=False, snippeted_fields
=None,
2037 returned_expressions
=None):
2042 For example, the following code fragment requests a search for
2043 documents where 'first' occurs in subject and 'good' occurs anywhere,
2044 returning at most 20 documents, starting the search from 'cursor token',
2045 returning another single cursor for the SearchResults, sorting by subject in
2046 descending order, returning the author, subject, and summary fields as well
2047 as a snippeted field content.
2049 results = index.search(Query(
2050 query='subject:first good',
2051 options=QueryOptions(
2054 sort_options=SortOptions(
2056 SortExpression(expression='subject')],
2058 returned_fields=['author', 'subject', 'summary'],
2059 snippeted_fields=['content'])))
2062 limit: The limit on number of documents to return in results.
2063 number_found_accuracy: The minimum accuracy requirement for
2064 SearchResults.number_found. If set, the number_found will be
2065 accurate up to at least that number. For example, when set to 100,
2066 any SearchResults with number_found <= 100 is accurate. This option
2067 may add considerable latency/expense, especially when used with
2069 cursor: A Cursor describing where to get the next set of results,
2070 or to provide next cursors in SearchResults.
2071 offset: The offset is number of documents to skip in search results. This
2072 is an alternative to using a query cursor, but allows random access into
2073 the results. Using offsets rather than cursors are more expensive. You
2074 can only use either cursor or offset, but not both. Using an offset
2075 means that no cursor is returned in SearchResults.cursor, nor in each
2076 ScoredDocument.cursor.
2077 sort_options: A SortOptions specifying a multi-dimensional sort over
2079 returned_fields: An iterable of names of fields to return in search
2081 ids_only: Only return document ids, do not return any fields.
2082 snippeted_fields: An iterable of names of fields to snippet and return
2083 in search result expressions.
2084 returned_expressions: An iterable of FieldExpression to evaluate and
2085 return in search results.
2087 TypeError: If an unknown iterator_options or sort_options is passed.
2088 ValueError: If ids_only and returned_fields are used together.
2089 ExpressionError: If one of the returned expression strings is not
2092 self
._limit
= _CheckLimit(limit
)
2093 self
._number
_found
_accuracy
= _CheckNumberFoundAccuracy(
2094 number_found_accuracy
)
2095 if cursor
is not None and not isinstance(cursor
, Cursor
):
2096 raise TypeError('cursor must be a Cursor, got %s' %
2097 cursor
.__class
__.__name
__)
2098 if cursor
is not None and offset
is not None:
2099 raise ValueError('cannot set cursor and offset together')
2100 self
._cursor
= cursor
2101 self
._offset
= _CheckOffset(offset
)
2102 if sort_options
is not None and not isinstance(sort_options
, SortOptions
):
2103 raise TypeError('sort_options must be a SortOptions, got %s' %
2104 sort_options
.__class
__.__name
__)
2105 self
._sort
_options
= sort_options
2107 self
._returned
_fields
= _ConvertToUnicodeList(returned_fields
)
2108 _CheckFieldNames(self
._returned
_fields
)
2109 self
._ids
_only
= ids_only
2110 if self
._ids
_only
and self
._returned
_fields
:
2111 raise ValueError('cannot have ids_only and returned_fields set together')
2112 self
._snippeted
_fields
= _ConvertToUnicodeList(snippeted_fields
)
2113 _CheckFieldNames(self
._snippeted
_fields
)
2114 self
._returned
_expressions
= _ConvertToList(returned_expressions
)
2115 for expression
in self
._returned
_expressions
:
2116 _CheckFieldName(_ConvertToUnicode(expression
.name
))
2117 _CheckExpression(_ConvertToUnicode(expression
.expression
))
2118 _CheckNumberOfFields(self
._returned
_expressions
, self
._snippeted
_fields
,
2119 self
._returned
_fields
)
2123 """Returns a limit on number of documents to return in results."""
def number_found_accuracy(self):
  """Returns the minimum accuracy required of SearchResults.number_found."""
  return self._number_found_accuracy
2133 """Returns the Cursor for the query."""
2138 """Returns the number of documents in search results to skip."""
def sort_options(self):
  """Returns the SortOptions held by this object."""
  return self._sort_options
def returned_fields(self):
  """Returns the names of the fields to return in search results."""
  return self._returned_fields
2153 """Returns whether to return only document ids in search results."""
2154 return self
._ids
_only
def snippeted_fields(self):
  """Returns the names of the fields to snippet and return in results."""
  return self._snippeted_fields
def returned_expressions(self):
  """Returns the FieldExpression objects to return in results."""
  return self._returned_expressions
2167 return _Repr(self
, [('limit', self
.limit
),
2168 ('number_found_accuracy', self
.number_found_accuracy
),
2169 ('cursor', self
.cursor
),
2170 ('sort_options', self
.sort_options
),
2171 ('returned_fields', self
.returned_fields
),
2172 ('ids_only', self
.ids_only
),
2173 ('snippeted_fields', self
.snippeted_fields
),
2174 ('returned_expressions', self
.returned_expressions
)])
2177 def _CopyQueryOptionsObjectToProtocolBuffer(query
, options
, params
):
2178 """Copies a QueryOptions object to a SearchParams proto buff."""
2180 web_safe_string
= None
2182 offset
= options
.offset
2184 cursor
= options
.cursor
2185 if cursor
.per_result
:
2186 cursor_type
= search_service_pb
.SearchParams
.PER_RESULT
2188 cursor_type
= search_service_pb
.SearchParams
.SINGLE
2189 if isinstance(cursor
, Cursor
) and cursor
.web_safe_string
:
2190 web_safe_string
= cursor
._internal
_cursor
2191 _CopyQueryOptionsToProtocolBuffer(
2192 query
, offset
, options
.limit
, options
.number_found_accuracy
,
2193 web_safe_string
, cursor_type
, options
.ids_only
, options
.returned_fields
,
2194 options
.snippeted_fields
, options
.returned_expressions
,
2195 options
.sort_options
, params
)
2198 def _CopyQueryOptionsToProtocolBuffer(
2199 query
, offset
, limit
, number_found_accuracy
, cursor
, cursor_type
, ids_only
,
2200 returned_fields
, snippeted_fields
, returned_expressions
, sort_options
,
2202 """Copies fields of QueryOptions to params protobuf."""
2204 params
.set_offset(offset
)
2205 params
.set_limit(limit
)
2206 if number_found_accuracy
is not None:
2207 params
.set_matched_count_accuracy(number_found_accuracy
)
2209 params
.set_cursor(cursor
.encode('utf-8'))
2210 if cursor_type
is not None:
2211 params
.set_cursor_type(cursor_type
)
2213 params
.set_keys_only(ids_only
)
2214 if returned_fields
or snippeted_fields
or returned_expressions
:
2215 field_spec_pb
= params
.mutable_field_spec()
2216 for field
in returned_fields
:
2217 field_spec_pb
.add_name(field
.encode('utf-8'))
2218 for snippeted_field
in snippeted_fields
:
2219 expression
= u
'snippet(%s, %s)' % (_QuoteString(query
), snippeted_field
)
2220 _CopyFieldExpressionToProtocolBuffer(
2222 name
=snippeted_field
, expression
=expression
.encode('utf-8')),
2223 field_spec_pb
.add_expression())
2224 for expression
in returned_expressions
:
2225 _CopyFieldExpressionToProtocolBuffer(
2226 expression
, field_spec_pb
.add_expression())
2228 if sort_options
is not None:
2229 _CopySortOptionsToProtocolBuffer(sort_options
, params
)
2232 class Query(object):
2233 """Represents a request on the search service to query the index."""
2235 def __init__(self
, query_string
, options
=None):
2241 For example, the following code fragment requests a search for
2242 documents where 'first' occurs in subject and 'good' occurs anywhere,
2243 returning at most 20 documents, starting the search from 'cursor token',
2244 returning another single document cursor for the results, sorting by
2245 subject in descending order, returning the author, subject, and summary
2246 fields as well as a snippeted field content.
2248 results = index.search(Query(
2249 query_string='subject:first good',
2250 options=QueryOptions(
2253 sort_options=SortOptions(
2255 SortExpression(expression='subject')],
2257 returned_fields=['author', 'subject', 'summary'],
2258 snippeted_fields=['content'])))
2260 In order to get a Cursor, you specify a Cursor in QueryOptions.cursor
2261 and extract the Cursor for the next request from results.cursor to
2262 continue from the last found document, as shown below:
2264 results = index.search(
2265 Query(query_string='subject:first good',
2266 options=QueryOptions(cursor=results.cursor)))
2269 query_string: The query to match against documents in the index. A query
2270 is a boolean expression containing terms. For example, the query
2271 'job tag:"very important" sent <= 2011-02-28'
2272 finds documents with the term job in any field, that contain the
2273 phrase "very important" in a tag field, and a sent date up to and
2274 including 28th February, 2011. You can use combinations of
2275 '(cat OR feline) food NOT dog'
2276 to find documents which contain the term cat or feline as well as food,
2277 but do not mention the term dog. A further example,
2278 'category:televisions brand:sony price >= 300 price < 400'
2279 will return documents which have televisions in a category field, a
2280 sony brand and a price field which is 300 (inclusive) to 400
2282 https://developers.google.com/appengine/docs/python/search/overview#Expressions
2283 for a list of expressions that can be used in queries.
2284 options: A QueryOptions describing post-processing of search results.
2286 QueryError: If the query string is not parseable.
2288 self
._query
_string
= _ConvertToUnicode(query_string
)
2289 _CheckQuery(self
._query
_string
)
2290 self
._options
= options
2293 def query_string(self
):
2294 """Returns the query string to be applied to search service."""
2295 return self
._query
_string
2299 """Returns QueryOptions defining post-processing on the search results."""
2300 return self
._options
2303 def _CopyQueryToProtocolBuffer(query
, params
):
2304 """Copies Query object to params protobuf."""
2305 params
.set_query(query
.encode('utf-8'))
def _CopyQueryObjectToProtocolBuffer(query, params):
  """Copies a Query, including its options, to the params protobuf.

  Args:
    query: The object supplying query_string and options.
    params: The protobuf that receives the query string and the options.
  """
  query_string = query.query_string
  _CopyQueryToProtocolBuffer(query_string, params)
  # A query without options is copied using a default QueryOptions.
  options = query.options if query.options is not None else QueryOptions()
  _CopyQueryOptionsObjectToProtocolBuffer(query_string, options, params)
2316 class Index(object):
2317 """Represents an index allowing indexing, deleting and searching documents.
2319 The following code fragment shows how to add documents, then search the
2320 index for documents matching a query.
2323 index = Index(name='index-name')
2325 # Create a document.
2326 doc = Document(doc_id='document-id',
2327 fields=[TextField(name='subject', value='my first email'),
2328 HtmlField(name='body',
2329 value='<html>some content here</html>')])
2331 # Index the document.
2334 except search.Error, e:
2335 # possibly retry indexing or log error
2339 results = index.search('subject:first body:here')
2341 # Iterate through the search results.
2342 for scored_document in results:
2343 print scored_document
2345 except search.Error, e:
2346 # possibly log the failure
2348 Once an index is created with a given specification, that specification is
2351 Search results may contain some out of date documents. However, any two
2352 changes to any document stored in an index are applied in the correct order.
2357 RESPONSE_CURSOR
, RESULT_CURSOR
= ('RESPONSE_CURSOR', 'RESULT_CURSOR')
2359 _CURSOR_TYPES
= frozenset([RESPONSE_CURSOR
, RESULT_CURSOR
])
2361 SEARCH
, DATASTORE
, CLOUD_STORAGE
= ('SEARCH', 'DATASTORE', 'CLOUD_STORAGE')
2363 _SOURCES
= frozenset([SEARCH
, DATASTORE
, CLOUD_STORAGE
])
2365 def __init__(self
, name
, namespace
=None, source
=SEARCH
):
2369 name: The name of the index. An index name must be a visible printable
2370 ASCII string not starting with '!'. Whitespace characters are excluded.
2371 namespace: The namespace of the index name. If not set, then the current
2373 source: Deprecated as of 1.7.6. The source of
2375 SEARCH - The Index was created by adding documents through this
2377 DATASTORE - The Index was created as a side-effect of putting entities
2379 CLOUD_STORAGE - The Index was created as a side-effect of adding
2380 objects into a Cloud Storage bucket.
2382 TypeError: If an unknown attribute is passed.
2383 ValueError: If invalid namespace is given.
2385 if source
not in self
._SOURCES
:
2386 raise ValueError('source must be one of %s' % self
._SOURCES
)
2387 if source
is not self
.SEARCH
:
2388 warnings
.warn('source is deprecated.', DeprecationWarning, stacklevel
=2)
2389 self
._source
= source
2390 self
._name
= _CheckIndexName(_ConvertToUnicode(name
))
2391 self
._namespace
= _ConvertToUnicode(namespace
)
2392 if self
._namespace
is None:
2393 self
._namespace
= _ConvertToUnicode(namespace_manager
.get_namespace())
2394 if self
._namespace
is None:
2395 self
._namespace
= u
''
2396 namespace_manager
.validate_namespace(self
._namespace
, exception
=ValueError)
2398 self
._storage
_usage
= None
2399 self
._storage
_limit
= None
2403 """Returns the schema mapping field names to list of types supported.
2405 Only valid for Indexes returned by search.get_indexes method."""
def storage_usage(self):
  """Returns the approximate number of bytes used by this index.

  The value may be slightly stale because it may not reflect the results
  of recent changes. It is None for indexes that were not obtained from
  search.get_indexes.
  """
  return self._storage_usage
def storage_limit(self):
  """Returns the maximum allowable storage for this index, in bytes.

  The value is None for indexes that were not obtained from
  search.get_indexes.
  """
  return self._storage_limit
2429 """Returns the name of the index."""
def namespace(self):
  """Returns the namespace that the index name belongs to."""
  return self._namespace
2439 """Returns the source of the index.
2441 Deprecated: from 1.7.6, source is no longer available."""
2442 warnings
.warn('source is deprecated.', DeprecationWarning, stacklevel
=2)
2445 def __eq__(self
, other
):
2446 return (isinstance(other
, self
.__class
__)
2447 and self
.__dict
__ == other
.__dict
__)
2449 def __ne__(self
, other
):
2450 return not self
.__eq
__(other
)
2453 return hash((self
._name
, self
._namespace
))
2457 return _Repr(self
, [('name', self
.name
), ('namespace', self
.namespace
),
2458 ('source', self
._source
),
2459 ('schema', self
.schema
),
2460 ('storage_usage', self
.storage_usage
),
2461 ('storage_limit', self
.storage_limit
)])
2463 def _NewPutResultFromPb(self
, status_pb
, doc_id
):
2464 """Constructs PutResult from RequestStatus pb and doc_id."""
2466 if status_pb
.has_error_detail():
2467 message
= _DecodeUTF8(status_pb
.error_detail())
2468 code
= _ERROR_OPERATION_CODE_MAP
.get(status_pb
.code(),
2469 OperationResult
.INTERNAL_ERROR
)
2470 return PutResult(code
=code
, message
=message
, id=_DecodeUTF8(doc_id
))
2472 def _NewPutResultList(self
, response
):
2473 return [self
._NewPutResultFromPb
(status
, doc_id
)
2474 for status
, doc_id
in zip(response
.status_list(),
2475 response
.doc_id_list())]
2477 @datastore_rpc._positional
(2)
2478 def put(self
, documents
, deadline
=None):
2479 """Index the collection of documents.
2481 If any of the documents are already in the index, then reindex them with
2482 their corresponding fresh document.
2485 documents: A Document or iterable of Documents to index.
2488 deadline: Deadline for RPC call in seconds; if None use the default.
2491 A list of PutResult, one per Document requested to be indexed.
2494 PutError: If one or more documents failed to index or
2495 number indexed did not match requested.
2496 TypeError: If an unknown attribute is passed.
2497 ValueError: If documents is not a Document or iterable of Document
2498 or number of the documents is larger than
2499 MAXIMUM_DOCUMENTS_PER_PUT_REQUEST or deadline is a negative number.
2502 if isinstance(documents
, basestring
):
2503 raise TypeError('documents must be a Document or sequence of '
2504 'Documents, got %s' % documents
.__class
__.__name
__)
2506 docs
= list(iter(documents
))
2513 if len(docs
) > MAXIMUM_DOCUMENTS_PER_PUT_REQUEST
:
2514 raise ValueError('too many documents to index')
2516 request
= search_service_pb
.IndexDocumentRequest()
2517 response
= search_service_pb
.IndexDocumentResponse()
2519 params
= request
.mutable_params()
2520 _CopyMetadataToProtocolBuffer(self
, params
.mutable_index_spec())
2523 for document
in docs
:
2524 doc_id
= document
.doc_id
2526 if doc_id
in seen_docs
:
2527 if document
!= seen_docs
[doc_id
]:
2529 'Different documents with the same ID found in the '
2530 'same call to Index.put()')
2534 seen_docs
[doc_id
] = document
2535 doc_pb
= params
.add_document()
2536 _CopyDocumentToProtocolBuffer(document
, doc_pb
)
2538 _MakeSyncSearchServiceCall('IndexDocument', request
, response
, deadline
)
2540 results
= self
._NewPutResultList
(response
)
2542 if response
.status_size() != len(params
.document_list()):
2543 raise PutError('did not index requested number of documents', results
)
2545 for status
in response
.status_list():
2546 if status
.code() != search_service_pb
.SearchServiceError
.OK
:
2548 _ConcatenateErrorMessages(
2549 'one or more put document operations failed', status
), results
)
2552 def _NewDeleteResultFromPb(self
, status_pb
, doc_id
):
2553 """Constructs DeleteResult from RequestStatus pb and doc_id."""
2555 if status_pb
.has_error_detail():
2556 message
= _DecodeUTF8(status_pb
.error_detail())
2557 code
= _ERROR_OPERATION_CODE_MAP
.get(status_pb
.code(),
2558 OperationResult
.INTERNAL_ERROR
)
2560 return DeleteResult(code
=code
, message
=message
, id=doc_id
)
2562 def _NewDeleteResultList(self
, document_ids
, response
):
2563 return [self
._NewDeleteResultFromPb
(status
, doc_id
)
2564 for status
, doc_id
in zip(response
.status_list(), document_ids
)]
2566 @datastore_rpc._positional
(2)
2567 def delete(self
, document_ids
, deadline
=None):
2568 """Delete the documents with the corresponding document ids from the index.
2570 If no document exists for the identifier in the list, then that document
2571 identifier is ignored.
2574 document_ids: A single identifier or list of identifiers of documents
2578 deadline: Deadline for RPC call in seconds; if None use the default.
2581 DeleteError: If one or more documents failed to remove or
2582 number removed did not match requested.
2583 ValueError: If document_ids is not a string or iterable of valid document
2584 identifiers or number of document ids is larger than
2585 MAXIMUM_DOCUMENTS_PER_PUT_REQUEST or deadline is a negative number.
2587 doc_ids
= _ConvertToList(document_ids
)
2591 if len(doc_ids
) > MAXIMUM_DOCUMENTS_PER_PUT_REQUEST
:
2592 raise ValueError('too many documents to delete')
2594 request
= search_service_pb
.DeleteDocumentRequest()
2595 response
= search_service_pb
.DeleteDocumentResponse()
2596 params
= request
.mutable_params()
2597 _CopyMetadataToProtocolBuffer(self
, params
.mutable_index_spec())
2598 for document_id
in doc_ids
:
2599 _CheckDocumentId(document_id
)
2600 params
.add_doc_id(document_id
)
2602 _MakeSyncSearchServiceCall('DeleteDocument', request
, response
,
2605 results
= self
._NewDeleteResultList
(doc_ids
, response
)
2607 if response
.status_size() != len(doc_ids
):
2609 'did not delete requested number of documents', results
)
2611 for status
in response
.status_list():
2612 if status
.code() != search_service_pb
.SearchServiceError
.OK
:
2614 _ConcatenateErrorMessages(
2615 'one or more delete document operations failed', status
),
2618 def delete_schema(self
):
2619 """Deprecated in 1.7.4. Delete the schema from the index.
2621 We are deprecating this method and replacing with more general schema
2622 and index management.
2624 A possible use may be remove typed fields which are no longer used. After
2625 you delete the schema, you need to index one or more documents to rebuild
2626 the schema. Until you re-index some documents, searches may fail, especially
2627 searches using field restricts.
2630 DeleteError: If the schema failed to be deleted.
2632 warnings
.warn('delete_schema is deprecated in 1.7.4.',
2633 DeprecationWarning, stacklevel
=2)
2634 request
= search_service_pb
.DeleteSchemaRequest()
2635 response
= search_service_pb
.DeleteSchemaResponse()
2636 params
= request
.mutable_params()
2637 _CopyMetadataToProtocolBuffer(self
, params
.add_index_spec())
2639 _MakeSyncSearchServiceCall('DeleteSchema', request
, response
, None)
2641 results
= self
._NewDeleteResultList
([self
.name
], response
)
2643 if response
.status_size() != 1:
2644 raise DeleteError('did not delete exactly one schema', results
)
2646 status
= response
.status_list()[0]
2647 if status
.code() != search_service_pb
.SearchServiceError
.OK
:
2649 _ConcatenateErrorMessages('delete schema operation failed', status
),
2652 def _NewScoredDocumentFromPb(self
, doc_pb
, sort_scores
, expressions
, cursor
):
2653 """Constructs a Document from a document_pb.Document protocol buffer."""
2655 if doc_pb
.has_language():
2656 lang
= _DecodeUTF8(doc_pb
.language())
2657 return ScoredDocument(
2658 doc_id
=_DecodeUTF8(doc_pb
.id()),
2659 fields
=_NewFieldsFromPb(doc_pb
.field_list()),
2660 language
=lang
, rank
=doc_pb
.order_id(), sort_scores
=sort_scores
,
2661 expressions
=_NewFieldsFromPb(expressions
), cursor
=cursor
)
2663 def _NewSearchResults(self
, response
, cursor
):
2664 """Returns a SearchResults populated from a search_service response pb."""
2666 for result_pb
in response
.result_list():
2667 per_result_cursor
= None
2668 if result_pb
.has_cursor():
2669 if isinstance(cursor
, Cursor
):
2671 per_result_cursor
= Cursor(web_safe_string
=_ToWebSafeString(
2672 cursor
.per_result
, _DecodeUTF8(result_pb
.cursor())))
2674 self
._NewScoredDocumentFromPb
(
2675 result_pb
.document(), result_pb
.score_list(),
2676 result_pb
.expression_list(), per_result_cursor
))
2677 results_cursor
= None
2678 if response
.has_cursor():
2679 if isinstance(cursor
, Cursor
):
2681 results_cursor
= Cursor(web_safe_string
=_ToWebSafeString(
2682 cursor
.per_result
, _DecodeUTF8(response
.cursor())))
2683 return SearchResults(
2684 results
=results
, number_found
=response
.matched_count(),
2685 cursor
=results_cursor
)
2687 @datastore_rpc._positional
(2)
2688 def get(self
, doc_id
, deadline
=None):
2689 """Retrieve a document by document ID.
2692 doc_id: The ID of the document to retrieve.
2695 deadline: Deadline for RPC call in seconds; if None use the default.
2698 If the document ID exists, returns the associated document. Otherwise,
2702 TypeError: If any of the parameters have invalid types, or an unknown
2703 attribute is passed.
2704 ValueError: If any of the parameters have invalid values (e.g., a
2707 response
= self
.get_range(start_id
=doc_id
, limit
=1, deadline
=deadline
)
2708 if response
.results
and response
.results
[0].doc_id
== doc_id
:
2709 return response
.results
[0]
2712 @datastore_rpc._positional
(2)
2713 def search(self
, query
, deadline
=None, **kwargs
):
2714 """Search the index for documents matching the query.
2716 For example, the following code fragment requests a search for
2717 documents where 'first' occurs in subject and 'good' occurs anywhere,
2718 returning at most 20 documents, starting the search from 'cursor token',
2719 returning another single cursor for the response, sorting by subject in
2720 descending order, returning the author, subject, and summary fields as well
2721 as a snippeted field content.
2723 results = index.search(
2724 query=Query('subject:first good',
2725 options=QueryOptions(limit=20,
2727 sort_options=SortOptions(
2728 expressions=[SortExpression(expression='subject')],
2730 returned_fields=['author', 'subject', 'summary'],
2731 snippeted_fields=['content'])))
2733 The following code fragment shows how to use a results cursor
2735 cursor = results.cursor
2736 for result in results:
2739 results = index.search(
2740 Query('subject:first good', options=QueryOptions(cursor=cursor)))
2742 The following code fragment shows how to use a per_result cursor
2744 results = index.search(
2745 query=Query('subject:first good',
2746 options=QueryOptions(limit=20,
2747 cursor=Cursor(per_result=True),
2751 for result in results:
2752 cursor = result.cursor
2754 results = index.search(
2755 Query('subject:first good', options=QueryOptions(cursor=cursor)))
2758 query: The Query to match against documents in the index.
2761 deadline: Deadline for RPC call in seconds; if None use the default.
2764 A SearchResults containing a list of documents matched, number returned
2765 and number matched by the query.
2768 TypeError: If any of the parameters have invalid types, or an unknown
2769 attribute is passed.
2770 ValueError: If any of the parameters have invalid values (e.g., a
2778 app_id
= kwargs
.pop('app_id', None)
2780 raise TypeError('Invalid arguments: %s' % ', '.join(kwargs
))
2782 request
= search_service_pb
.SearchRequest()
2784 request
.set_app_id(app_id
)
2786 params
= request
.mutable_params()
2787 if isinstance(query
, basestring
):
2788 query
= Query(query_string
=query
)
2789 _CopyMetadataToProtocolBuffer(self
, params
.mutable_index_spec())
2790 _CopyQueryObjectToProtocolBuffer(query
, params
)
2792 response
= search_service_pb
.SearchResponse()
2794 _MakeSyncSearchServiceCall('Search', request
, response
, deadline
)
2796 _CheckStatus(response
.status())
2799 cursor
= query
.options
.cursor
2800 return self
._NewSearchResults
(response
, cursor
)
2802 def _NewGetResponse(self
, response
):
2803 """Returns a GetResponse from the list_documents response pb."""
2805 for doc_proto
in response
.document_list():
2806 documents
.append(_NewDocumentFromPb(doc_proto
))
2808 return GetResponse(results
=documents
)
2810 def _GetRange(self
, start_id
=None, include_start_object
=True,
2811 limit
=100, ids_only
=False, deadline
=None, app_id
=None):
2812 """Get a range of objects in the index, in id order in a response."""
2813 request
= search_service_pb
.ListDocumentsRequest()
2815 request
.set_app_id(app_id
)
2817 params
= request
.mutable_params()
2818 _CopyMetadataToProtocolBuffer(self
, params
.mutable_index_spec())
2821 params
.set_start_doc_id(start_id
)
2822 params
.set_include_start_doc(include_start_object
)
2824 params
.set_limit(_CheckInteger(
2825 limit
, 'limit', zero_ok
=False,
2826 upper_bound
=MAXIMUM_DOCUMENTS_RETURNED_PER_SEARCH
))
2827 params
.set_keys_only(ids_only
)
2829 response
= search_service_pb
.ListDocumentsResponse()
2830 _MakeSyncSearchServiceCall('ListDocuments', request
, response
, deadline
)
2832 _CheckStatus(response
.status())
2835 @datastore_rpc._positional
(5)
2836 def get_range(self
, start_id
=None, include_start_object
=True,
2837 limit
=100, ids_only
=False, deadline
=None, **kwargs
):
2838 """Get a range of Documents in the index, in id order.
2841 start_id: String containing the Id from which to list
2842 Documents from. By default, starts at the first Id.
2843 include_start_object: If true, include the Document with the
2844 Id specified by the start_id parameter.
2845 limit: The maximum number of Documents to return.
2846 ids_only: If true, the Documents returned only contain their keys.
2849 deadline: Deadline for RPC call in seconds; if None use the default.
2852 A GetResponse containing a list of Documents, ordered by Id.
2855 Error: Some subclass of Error is raised if an error occurred processing
2857 TypeError: If any of the parameters have invalid types, or an unknown
2858 attribute is passed.
2859 ValueError: If any of the parameters have invalid values (e.g., a
2863 app_id
= kwargs
.pop('app_id', None)
2865 raise TypeError('Invalid arguments: %s' % ', '.join(kwargs
))
2866 response
= self
._GetRange
(
2867 start_id
=start_id
, include_start_object
=include_start_object
,
2868 limit
=limit
, ids_only
=ids_only
, deadline
=deadline
, app_id
=app_id
)
2869 return self
._NewGetResponse
(response
)
2872 _CURSOR_TYPE_PB_MAP
= {
2873 None: search_service_pb
.SearchParams
.NONE
,
2874 Index
.RESPONSE_CURSOR
: search_service_pb
.SearchParams
.SINGLE
,
2875 Index
.RESULT_CURSOR
: search_service_pb
.SearchParams
.PER_RESULT
# Maps each Index source constant to the corresponding
# search_service_pb.IndexSpec source value.
_SOURCES_TO_PB_MAP = {
    Index.SEARCH: search_service_pb.IndexSpec.SEARCH,
    Index.DATASTORE: search_service_pb.IndexSpec.DATASTORE,
    Index.CLOUD_STORAGE: search_service_pb.IndexSpec.CLOUD_STORAGE}


# Inverse of _SOURCES_TO_PB_MAP: maps each search_service_pb.IndexSpec source
# value back to the corresponding Index source constant.
_SOURCE_PB_TO_SOURCES_MAP = {
    search_service_pb.IndexSpec.SEARCH: Index.SEARCH,
    search_service_pb.IndexSpec.DATASTORE: Index.DATASTORE,
    search_service_pb.IndexSpec.CLOUD_STORAGE: Index.CLOUD_STORAGE}
def _CopyMetadataToProtocolBuffer(index, spec_pb):
  """Copies an Index specification to a search_service_pb.IndexSpec.

  Args:
    index: The Index whose name, namespace and source are copied.
    spec_pb: The IndexSpec protobuf that receives the values.
  """
  encoded_name = index.name.encode('utf-8')
  spec_pb.set_name(encoded_name)
  encoded_namespace = index.namespace.encode('utf-8')
  spec_pb.set_namespace(encoded_namespace)

  source = index._source
  if source == Index.SEARCH:
    # The source is only written when it differs from Index.SEARCH.
    return
  spec_pb.set_source(_SOURCES_TO_PB_MAP.get(source))
2904 document_pb
.FieldValue
.TEXT
: Field
.TEXT
,
2905 document_pb
.FieldValue
.HTML
: Field
.HTML
,
2906 document_pb
.FieldValue
.ATOM
: Field
.ATOM
,
2907 document_pb
.FieldValue
.DATE
: Field
.DATE
,
2908 document_pb
.FieldValue
.NUMBER
: Field
.NUMBER
,
2909 document_pb
.FieldValue
.GEO
: Field
.GEO_POINT
,
2913 def _NewSchemaFromPb(field_type_pb_list
):
2914 """Creates map of field name to type list from document_pb.FieldTypes list."""
2916 for field_type_pb
in field_type_pb_list
:
2917 for field_type
in field_type_pb
.type_list():
2918 public_type
= _FIELD_TYPE_MAP
[field_type
]
2919 name
= _DecodeUTF8(field_type_pb
.name())
2920 if name
in field_types
:
2921 field_types
[name
].append(public_type
)
2923 field_types
[name
] = [public_type
]
2927 def _NewIndexFromIndexSpecPb(index_spec_pb
):
2928 """Creates an Index from a search_service_pb.IndexSpec."""
2929 source
= _SOURCE_PB_TO_SOURCES_MAP
.get(index_spec_pb
.source())
2931 if index_spec_pb
.has_namespace():
2932 index
= Index(name
=index_spec_pb
.name(),
2933 namespace
=index_spec_pb
.namespace(),
2936 index
= Index(name
=index_spec_pb
.name(), source
=source
)
2940 def _NewIndexFromPb(index_metadata_pb
):
2941 """Creates an Index from a search_service_pb.IndexMetadata."""
2942 index
= _NewIndexFromIndexSpecPb(index_metadata_pb
.index_spec())
2943 if index_metadata_pb
.field_list():
2944 index
._schema
= _NewSchemaFromPb(index_metadata_pb
.field_list())
2945 if index_metadata_pb
.has_storage():
2946 index
._storage
_usage
= index_metadata_pb
.storage().amount_used()
2947 index
._storage
_limit
= index_metadata_pb
.storage().limit()
2951 def _MakeSyncSearchServiceCall(call
, request
, response
, deadline
):
2952 """Make a synchronous call to search service.
2954 If the deadline is not None, waits only until the deadline expires.
2957 call: Method name to call, as a string
2958 request: The request object
2959 response: The response object
2962 deadline: Deadline for RPC call in seconds; if None use the default.
2965 TypeError: if the deadline is not a number and is not None.
2966 ValueError: If the deadline is less than zero.
2969 if deadline
is None:
2970 apiproxy_stub_map
.MakeSyncCall('search', call
, request
, response
)
2974 if (not isinstance(deadline
, (int, long, float))
2975 or isinstance(deadline
, (bool,))):
2976 raise TypeError('deadline argument should be int/long/float (%r)'
2979 raise ValueError('deadline argument must be > 0 (%s)' % (deadline
,))
2980 rpc
= apiproxy_stub_map
.UserRPC('search', deadline
=deadline
)
2981 rpc
.make_call(call
, request
, response
)
2984 except apiproxy_errors
.ApplicationError
, e
:
2985 raise _ToSearchError(e
)