Optimise diversification by implementing C2-GLS
[xapian.git] / xapian-bindings / python3 / extra.i
blob8e0f49a82a56fed1bab36d88ec4fbbdfe7328105
1 %{
2 /* python/extra.i: Xapian scripting python interface additional python code.
4 * Copyright (C) 2003,2004,2005 James Aylett
5 * Copyright (C) 2005,2006,2007,2008,2009,2010,2011,2013 Olly Betts
6 * Copyright (C) 2007 Lemur Consulting Ltd
7 * Copyright (C) 2010 Richard Boulton
9 * This program is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU General Public License as
11 * published by the Free Software Foundation; either version 2 of the
12 * License, or (at your option) any later version.
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
22 * USA
26 %pythoncode %{
28 # Declare the markup used by the docstrings in this module - tools such as
29 # "epydoc" use this setting to decide how to format the documentation strings.
30 __docformat__ = "restructuredtext en"
32 ##################################
33 # Support for iteration of MSets #
34 ##################################
class MSetItem(object):
    """A single result obtained by iterating over (or indexing) an MSet.

    The following attributes and properties are available on each item:

     - `docid`: The Xapian document ID corresponding to this MSet item.
     - `weight`: The weight corresponding to this MSet item.
     - `rank`: The rank of this MSet item.  The rank is the position of this
       item in the total set of matching documents.  The highest document is
       given a rank of 0.  If the MSet did not start at the highest matching
       document (because a non-zero 'start' parameter was supplied to
       get_mset()), the first document in the MSet will have a rank greater
       than 0 - in fact, it will equal the 'start' value supplied to
       get_mset().
     - `percent`: The percentage score assigned to this MSet item.
     - `document`: The document for this MSet item.  This gives access to the
       document data and to any other information stored in the document
       (such as term lists).  It is lazily evaluated.
     - `collapse_key`: The value of the key which was used for collapsing.
     - `collapse_count`: An estimate of the number of documents that have been
       collapsed into this one.

    The collapse count estimate will always be less than or equal to the
    actual number of other documents satisfying the match criteria with the
    same collapse key as this document.  It may be 0 even though there are
    other documents with the same collapse key which satisfy the match
    criteria.  However, if it is non-zero, there definitely are other such
    documents.  So it may be used to inform the user that there are "at least
    N other matches in this group", or to control whether to offer a "show
    other documents in this group" feature (but note that it may not offer it
    in every case where it would show other documents).

    """

    __slots__ = ('_mset', '_firstitem', 'docid', 'weight', 'rank',
                 'percent', 'collapse_key', 'collapse_count', '_document', )

    def __init__(self, iter, mset):
        # Keep the owning MSet (and the offset of its first item) so that the
        # document can be fetched later, on demand.
        self._mset = mset
        self._firstitem = mset.get_firstitem()

        # Snapshot the per-hit values now, while the iterator still points
        # at this hit.
        self.docid = iter.get_docid()
        self.weight = iter.get_weight()
        self.rank = iter.get_rank()
        self.percent = iter.get_percent()
        self.collapse_key = iter.get_collapse_key()
        self.collapse_count = iter.get_collapse_count()

        # Filled in by _get_document() the first time it is needed.
        self._document = None

    def _get_document(self):
        # Fetch the document on first access and cache it afterwards.
        cached = self._document
        if cached is None:
            # Convert the absolute rank into an index relative to the MSet.
            offset = self.rank - self._firstitem
            cached = self._mset._get_hit_internal(offset).get_document()
            self._document = cached
        return cached

    document = property(_get_document, doc="The document object corresponding to this MSet item.")
class MSetIter(object):
    """Iterator over the items held in an MSet.

    Each step produces an MSetItem object, which is evaluated lazily where
    appropriate.

    """
    __slots__ = ('_iter', '_end', '_mset')

    def __init__(self, mset):
        self._iter = mset._begin()
        self._end = mset._end()
        self._mset = mset

    def __iter__(self):
        return self

    def __next__(self):
        if self._iter == self._end:
            raise StopIteration
        # Capture the current hit before stepping the underlying iterator.
        item = MSetItem(self._iter, self._mset)
        next(self._iter)
        return item
# Extend MSet with Python iteration support, plus other convenience methods.

def _mset_gen_iter(self):
    """Return an iterator over the MSet.

    The iterator produces MSetItem objects, which are evaluated lazily where
    appropriate.

    """
    items = MSetIter(self)
    return items
MSet.__iter__ = _mset_gen_iter

# len(mset) reports the number of items in the MSet.
MSet.__len__ = lambda self: MSet.size(self)
def _mset_getitem(self, index):
    """Fetch one item from the MSet by position.

    `index` counts from the start of this MSet; it is not the absolute rank
    of the item.  A negative index counts back from the end of the MSet.

    Returns an MSetItem.  Raises IndexError if the index is out of range.

    """
    position = index
    if position < 0:
        # Negative indices are interpreted relative to the end.
        position += len(self)
    if not (0 <= position < len(self)):
        raise IndexError("Mset index out of range")
    return MSetItem(self._get_hit_internal(position), self)
MSet.__getitem__ = _mset_getitem
MSet.get_hit = _mset_getitem
148 ##################################
149 # Support for iteration of ESets #
150 ##################################
class ESetItem(object):
    """A single entry obtained by iterating over an ESet.

    Each item exposes the following attributes:

     - `term`: The term corresponding to this ESet item.
     - `weight`: The weight corresponding to this ESet item.

    """
    __slots__ = ('term', 'weight')

    def __init__(self, iter):
        # Copy the values out of the iterator so that the item stays valid
        # after the iterator has moved on.
        term = iter.get_term()
        weight = iter.get_weight()
        self.term = term
        self.weight = weight
class ESetIter(object):
    """Iterator over the items held in an ESet.

    Each step produces an ESetItem object.

    """
    __slots__ = ('_iter', '_end')

    def __init__(self, eset):
        self._iter = eset._begin()
        self._end = eset._end()

    def __iter__(self):
        return self

    def __next__(self):
        if self._iter == self._end:
            raise StopIteration
        # Capture the current entry before stepping the underlying iterator.
        item = ESetItem(self._iter)
        next(self._iter)
        return item
# Extend ESet with Python iteration support, plus other convenience methods.

def _eset_gen_iter(self):
    """Return an iterator over the ESet.

    The iterator produces ESetItem objects.

    """
    items = ESetIter(self)
    return items
ESet.__iter__ = _eset_gen_iter

# len(eset) reports the number of items in the ESet.
ESet.__len__ = lambda self: ESet.size(self)
204 #######################################
205 # Support for iteration of term lists #
206 #######################################
class TermListItem(object):
    """An item returned from iteration of a term list.

    The item supports access to the following attributes and properties:

     - `term`: The term corresponding to this TermListItem.
     - `wdf`: The within document frequency of this term.
     - `termfreq`: The number of documents in the collection which are indexed
       by the term.
     - `positer`: An iterator over the positions which the term appears at in
       the document.  This is only available until the iterator which returned
       this item next moves.

    """
    __slots__ = ('_iter', 'term', '_wdf', '_termfreq')

    def __init__(self, iter, term):
        """Create an item for `term`, produced by the TermIter `iter`.

        Values which the iterator declares as EAGER are read immediately;
        everything else is left to be fetched on first access.

        """
        self._iter = iter
        self.term = term
        self._wdf = None
        self._termfreq = None

        if iter._has_wdf == TermIter.EAGER:
            self._wdf = iter._iter.get_wdf()
        if iter._has_termfreq == TermIter.EAGER:
            self._termfreq = iter._iter.get_termfreq()

    def _check_not_moved(self):
        """Raise InvalidOperationError if the parent iterator has moved on.

        The comparison is deliberately by identity: the iterator keeps the
        very same term object in `_lastterm` for as long as it stays on the
        entry this item was created from.

        """
        if self.term is not self._iter._lastterm:
            raise InvalidOperationError("Iterator has moved, and does not support random access")

    def _get_wdf(self):
        """Get the within-document-frequency of the current term.

        This will raise an InvalidOperationError exception if the iterator
        this item came from doesn't support within-document-frequencies, or
        if the value has not been read yet and the iterator has moved on.

        """
        if self._wdf is None:
            if self._iter._has_wdf == TermIter.INVALID:
                raise InvalidOperationError("Iterator does not support wdfs")
            self._check_not_moved()
            # Cache the value so it remains available after the iterator moves.
            self._wdf = self._iter._iter.get_wdf()
        return self._wdf
    wdf = property(_get_wdf, doc=
    """The within-document-frequency of the current term (if meaningful).

    This will raise an InvalidOperationError exception if the iterator
    this item came from doesn't support within-document-frequencies.

    """)

    def _get_termfreq(self):
        """Get the term frequency.

        This is the number of documents in the collection which are indexed by
        the term.

        This will raise an InvalidOperationError exception if the iterator
        this item came from doesn't support term frequencies, or if the value
        has not been read yet and the iterator has moved on.

        """
        if self._termfreq is None:
            if self._iter._has_termfreq == TermIter.INVALID:
                raise InvalidOperationError("Iterator does not support term frequencies")
            self._check_not_moved()
            # Cache the value so it remains available after the iterator moves.
            self._termfreq = self._iter._iter.get_termfreq()
        return self._termfreq
    termfreq = property(_get_termfreq, doc=
    """The term frequency of the current term (if meaningful).

    This is the number of documents in the collection which are indexed by the
    term.

    This will raise an InvalidOperationError exception if the iterator
    this item came from doesn't support term frequencies.

    """)

    def _get_positer(self):
        """Get a position list iterator.

        The iterator will return integers representing the positions that the
        term occurs at.

        This will raise an InvalidOperationError exception if the iterator
        this item came from doesn't support position lists, or if the iterator
        has moved on since the item was returned from it.

        """
        if self._iter._has_positions == TermIter.INVALID:
            raise InvalidOperationError("Iterator does not support position lists")
        # Access to position lists is always lazy, so we don't need to check
        # _has_positions any further.
        self._check_not_moved()
        return PositionIter(self._iter._iter._positionlist_begin(),
                            self._iter._iter._positionlist_end())
    positer = property(_get_positer, doc=
    """A position iterator for the current term (if meaningful).

    The iterator will return integers representing the positions that the term
    occurs at.

    This will raise an InvalidOperationError exception if the iterator this
    item came from doesn't support position lists, or if the iterator has moved
    on since the item was returned from it.

    """)
class TermIter(object):
    """Iterator over a term list.

    Depending on `return_strings`, each step produces either the term itself
    or a TermListItem object, which is evaluated lazily where appropriate.

    """
    __slots__ = ('_iter', '_end', '_has_termfreq', '_has_wdf',
                 '_has_positions', '_return_strings', '_lastterm', '_moved')

    # Levels of support for a piece of per-term information.
    INVALID = 0
    LAZY = 1
    EAGER = 2

    def __init__(self, start, end, has_termfreq=INVALID,
                 has_wdf=INVALID, has_positions=INVALID,
                 return_strings=False):
        self._iter = start
        self._end = end
        self._has_termfreq = has_termfreq
        self._has_wdf = has_wdf
        self._has_positions = has_positions
        # Can't do eager access to position lists.
        assert has_positions != TermIter.EAGER
        self._return_strings = return_strings

        # The term most recently handed out; items compare against this to
        # detect whether the iterator has moved.
        self._lastterm = None

        # _moved is True when the underlying iterator already points at the
        # entry which the next call to __next__() should return.  The
        # underlying iterator starts out on a valid entry, so __next__()
        # can't simply advance it unconditionally; instead the advance is
        # postponed until just before the following entry is requested.
        self._moved = True

    def __iter__(self):
        return self

    def __next__(self):
        # Perform the advance which was postponed by the previous call.
        if not self._moved:
            next(self._iter)
            self._moved = True

        if self._iter == self._end:
            self._lastterm = None
            raise StopIteration

        term = self._iter.get_term()
        self._lastterm = term
        self._moved = False
        if self._return_strings:
            return term
        return TermListItem(self, term)

    def skip_to(self, term):
        """Skip the iterator forward.

        The iterator is advanced to the first term at or after the current
        position which is greater than or equal to the supplied term.

        If there are no such items, this will raise StopIteration.

        This returns the item which the iterator is moved to.  The subsequent
        item will be returned the next time that next() is called (unless
        skip_to() is called again first).

        """
        if self._iter != self._end:
            self._iter.skip_to(term)

        if self._iter == self._end:
            self._lastterm = None
            self._moved = True
            raise StopIteration

        # Only replace self._lastterm when the iterator really landed on a
        # different term.  TermListItems compare their saved term against
        # self._lastterm by object identity, so the stored object must not be
        # swapped for another one which merely compares equal.
        landed = self._iter.get_term()
        if landed != self._lastterm:
            self._lastterm = landed

        self._moved = False
        current = self._lastterm
        if self._return_strings:
            return current
        return TermListItem(self, current)
# Give Enquire a "matching_terms()" method.
def _enquire_gen_iter(self, which):
    """Get an iterator over the terms which match a given match set item.

    `which` selects the match set item to consider; it may be either a
    document ID or an MSetItem object.

    The iterator will return string objects.

    """
    # Reduce an MSetItem to its document ID; anything else is used as given.
    docid = which.docid if isinstance(which, MSetItem) else which
    begin = self._get_matching_terms_begin(docid)
    end = self._get_matching_terms_end(docid)
    return TermIter(begin, end, return_strings=True)
Enquire.matching_terms = _enquire_gen_iter
# Give Query an "__iter__()" method.
def _query_gen_iter(self):
    """Get an iterator over the terms in a query.

    The iterator will return string objects.

    """
    begin = self._get_terms_begin()
    end = self._get_terms_end()
    return TermIter(begin, end, return_strings=True)
Query.__iter__ = _query_gen_iter
# Give Database an "__iter__()" method and an "allterms()" method.
def _database_gen_allterms_iter(self, prefix=None):
    """Get an iterator over all the terms in the database.

    The iterator will return TermListItem objects, but these will not support
    access to wdf, or position information.

    Access to term frequency information is only available until the iterator
    has moved on.

    If prefix is supplied, only terms which start with that prefix will be
    returned.

    """
    # The prefix is only passed through when the caller actually gave one.
    extra = () if prefix is None else (prefix,)
    begin = self._allterms_begin(*extra)
    end = self._allterms_end(*extra)
    return TermIter(begin, end, has_termfreq=TermIter.LAZY)
Database.__iter__ = _database_gen_allterms_iter
Database.allterms = _database_gen_allterms_iter
# Give Database a "termlist()" method.
def _database_gen_termlist_iter(self, docid):
    """Get an iterator over all the terms which index a given document ID.

    The iterator will return TermListItem objects.

    Access to term frequency and position information is only available until
    the iterator has moved on.

    """
    begin = self._termlist_begin(docid)
    end = self._termlist_end(docid)
    # Note: has_termfreq is LAZY because most databases don't store term
    # frequencies in the termlist (doing so would require updating many
    # termlist entries for every document update), so reading the term
    # frequency requires a separate lookup.
    return TermIter(begin, end,
                    has_termfreq=TermIter.LAZY,
                    has_wdf=TermIter.EAGER,
                    has_positions=TermIter.LAZY)
Database.termlist = _database_gen_termlist_iter
# Give Database a "spellings()" method.
def _database_gen_spellings_iter(self):
    """Get an iterator which returns all the spelling correction targets.

    The iterator will return TermListItem objects.  Only the term frequency is
    available; wdf and positions are not meaningful.

    """
    begin = self._spellings_begin()
    end = self._spellings_end()
    return TermIter(begin, end,
                    has_termfreq=TermIter.EAGER,
                    has_wdf=TermIter.INVALID,
                    has_positions=TermIter.INVALID)
Database.spellings = _database_gen_spellings_iter
# Give Database a "synonyms()" method.
def _database_gen_synonyms_iter(self, term):
    """Get an iterator which returns all the synonyms for a given term.

    `term` is the term whose synonyms are wanted.

    The iterator will return string objects.

    """
    begin = self._synonyms_begin(term)
    end = self._synonyms_end(term)
    return TermIter(begin, end, return_strings=True)
Database.synonyms = _database_gen_synonyms_iter
# Give Database a "synonym_keys()" method.
def _database_gen_synonym_keys_iter(self, prefix=""):
    """Get an iterator which returns all the terms which have synonyms.

    The iterator will return string objects.

    If `prefix` is non-empty, only terms with this prefix are returned.

    """
    begin = self._synonym_keys_begin(prefix)
    end = self._synonym_keys_end(prefix)
    return TermIter(begin, end, return_strings=True)
Database.synonym_keys = _database_gen_synonym_keys_iter
# Give Database a "metadata_keys()" method, to be used instead of direct
# access to metadata_keys_begin and metadata_keys_end.
def _database_gen_metadata_keys_iter(self, prefix=""):
    """Get an iterator which returns all the metadata keys.

    The iterator will return string objects.

    If `prefix` is non-empty, only metadata keys with this prefix are returned.

    """
    begin = self._metadata_keys_begin(prefix)
    end = self._metadata_keys_end(prefix)
    return TermIter(begin, end, return_strings=True)
Database.metadata_keys = _database_gen_metadata_keys_iter
# Give Document an "__iter__()" method and a "termlist()" method.
def _document_gen_termlist_iter(self):
    """Get an iterator over all the terms in a document.

    The iterator will return TermListItem objects.

    Access to term frequency and position information is only available until
    the iterator has moved on.

    Note that term frequency information is only meaningful for a document
    retrieved from a database.  If term frequency information is requested for
    a document which was freshly created, an InvalidOperationError will be
    raised.

    """
    begin = self._termlist_begin()
    end = self._termlist_end()
    # Note: a document termlist iterator may be implemented entirely
    # in-memory (in which case eager access to every item could be allowed),
    # but it may also be backed by a database termlist (for documents which
    # are stored in a database rather than freshly created).  We choose the
    # most conservative settings, to avoid doing eager access when lazy
    # access would be more appropriate.
    return TermIter(begin, end,
                    has_termfreq=TermIter.LAZY,
                    has_wdf=TermIter.EAGER,
                    has_positions=TermIter.LAZY)
Document.__iter__ = _document_gen_termlist_iter
Document.termlist = _document_gen_termlist_iter
# Give QueryParser a "stoplist()" method.
def _queryparser_gen_stoplist_iter(self):
    """Get an iterator over all the stopped terms from the previous query.

    This returns an iterator over all the terms which were omitted from the
    previously parsed query due to being considered to be stopwords.  Each
    instance of a word omitted from the query is represented in the returned
    list.

    The iterator will return string objects.

    """
    begin = self._stoplist_begin()
    end = self._stoplist_end()
    return TermIter(begin, end, return_strings=True)
QueryParser.stoplist = _queryparser_gen_stoplist_iter
# Give QueryParser an "unstemlist()" method.
def _queryparser_gen_unstemlist_iter(self, tname):
    """Get an iterator over all the unstemmed forms of a stemmed term.

    The iterator returns every unstemmed word which was stemmed to the form
    given by `tname` while parsing the previous query.  Each instance of a
    word which stems to `tname` is returned, in the order in which the words
    appeared in the query - an individual unstemmed word may thus occur
    multiple times.

    The iterator will return string objects.

    """
    begin = self._unstem_begin(tname)
    end = self._unstem_end(tname)
    return TermIter(begin, end, return_strings=True)
QueryParser.unstemlist = _queryparser_gen_unstemlist_iter
# Give ValueCountMatchSpy a "values()" method.
def wrapper():
    # Take private copies of the raw begin/end accessors and remove them from
    # the class, so that only the "values()" method below remains exposed.
    raw_begin = ValueCountMatchSpy.values_begin
    del ValueCountMatchSpy.values_begin
    raw_end = ValueCountMatchSpy.values_end
    del ValueCountMatchSpy.values_end

    def values(self):
        """Get an iterator over all the values in the slot.

        Values will be returned in ascending alphabetical order.

        The iterator will return TermListItem objects: the value can be
        accessed as the `term` property, and the frequency can be accessed as
        the `termfreq` property.

        """
        first = raw_begin(self)
        last = raw_end(self)
        return TermIter(first, last, has_termfreq=TermIter.EAGER)
    return values
ValueCountMatchSpy.values = wrapper()
del wrapper
# Give ValueCountMatchSpy a "top_values()" method.
def wrapper():
    # Take private copies of the raw begin/end accessors and remove them from
    # the class, so that only the "top_values()" method below remains exposed.
    raw_begin = ValueCountMatchSpy.top_values_begin
    del ValueCountMatchSpy.top_values_begin
    raw_end = ValueCountMatchSpy.top_values_end
    del ValueCountMatchSpy.top_values_end

    def top_values(self, maxvalues):
        """Get an iterator over the most frequent values for the slot.

        Values will be returned in descending order of frequency.  Values with
        the same frequency will be returned in ascending alphabetical order.

        The iterator will return TermListItem objects: the value can be
        accessed as the `term` property, and the frequency can be accessed as
        the `termfreq` property.

        """
        first = raw_begin(self, maxvalues)
        last = raw_end(self, maxvalues)
        return TermIter(first, last, has_termfreq=TermIter.EAGER)
    return top_values
ValueCountMatchSpy.top_values = wrapper()
del wrapper
# Whenever a query is built, remember the postingsources it involves so that
# they won't be deleted.  This hack can probably be removed once xapian bug
# #186 is fixed.
__query_init_orig = Query.__init__
def _query_init(self, *args):
    """Make a new query object.

    Many possible arguments are possible - see the documentation for details.

    """
    if len(args) == 1 and isinstance(args[0], PostingSource):
        # Built directly from a posting source: keep that source alive.
        kept = [args[0]]
    else:
        kept = []
        for arg in args:
            if isinstance(arg, Query):
                # Inherit whatever a sub-query is already keeping alive.
                kept.extend(getattr(arg, '_ps', []))
                continue
            if not hasattr(arg, '__iter__'):
                continue
            # An iterable argument: inherit from each Query it contains.
            for member in arg:
                if isinstance(member, Query):
                    kept.extend(getattr(member, '_ps', []))
    __query_init_orig(self, *args)
    self._ps = kept
Query.__init__ = _query_init
del _query_init
# Whenever a query is set on enquire, remember the postingsources it involves
# so that they won't be deleted.  This hack can probably be removed once
# xapian bug #186 is fixed.
__enquire_set_query_orig = Enquire.set_query
def _enquire_set_query(self, query, qlen=0):
    # Take over the query's list of postingsources before delegating to the
    # wrapped method.
    kept = getattr(query, '_ps', [])
    self._ps = kept
    return __enquire_set_query_orig(self, query, qlen)
_enquire_set_query.__doc__ = __enquire_set_query_orig.__doc__
Enquire.set_query = _enquire_set_query
del _enquire_set_query
# Whenever a query is fetched from enquire, pass on the postingsources it
# involves so that they won't be deleted.  This hack can probably be removed
# once xapian bug #186 is fixed.
__enquire_get_query_orig = Enquire.get_query
def _enquire_get_query(self):
    result = __enquire_get_query_orig(self)
    # Attach the postingsources recorded on this Enquire (if any) to the
    # returned query.
    result._ps = getattr(self, '_ps', [])
    return result
_enquire_get_query.__doc__ = __enquire_get_query_orig.__doc__
Enquire.get_query = _enquire_get_query
del _enquire_get_query
# Whenever a ValueRangeProcessor is added to the QueryParser, hold a python
# reference to it so that it won't be deleted.  This hack can probably be
# removed once xapian bug #186 is fixed.
__queryparser_add_valuerangeprocessor_orig = QueryParser.add_valuerangeprocessor
def _queryparser_add_valuerangeprocessor(self, vrproc):
    # Create the list of retained processors on first use.
    try:
        kept = self._vrps
    except AttributeError:
        kept = []
        self._vrps = kept
    kept.append(vrproc)
    return __queryparser_add_valuerangeprocessor_orig(self, vrproc)
_queryparser_add_valuerangeprocessor.__doc__ = __queryparser_add_valuerangeprocessor_orig.__doc__
QueryParser.add_valuerangeprocessor = _queryparser_add_valuerangeprocessor
del _queryparser_add_valuerangeprocessor
# Whenever a RangeProcessor is added to the QueryParser, hold a python
# reference to it so that it won't be deleted.  This hack can probably be
# removed once xapian bug #186 is fixed.
__queryparser_add_rangeprocessor_orig = QueryParser.add_rangeprocessor
def _queryparser_add_rangeprocessor(self, rproc):
    # Create the list of retained processors on first use.
    try:
        kept = self._rps
    except AttributeError:
        kept = []
        self._rps = kept
    kept.append(rproc)
    return __queryparser_add_rangeprocessor_orig(self, rproc)
_queryparser_add_rangeprocessor.__doc__ = __queryparser_add_rangeprocessor_orig.__doc__
QueryParser.add_rangeprocessor = _queryparser_add_rangeprocessor
del _queryparser_add_rangeprocessor
# Whenever a FieldProcessor is added to the QueryParser, hold a python
# reference to it so that it won't be deleted.  This hack can probably be
# removed once xapian bug #186 is fixed.
__queryparser_add_prefix_orig = QueryParser.add_prefix
def _queryparser_add_prefix(self, s, proc):
    # A str or bytes second argument needs no retaining; anything else is
    # kept in a list on the parser.
    needs_keeping = not isinstance(proc, (str, bytes))
    if needs_keeping:
        try:
            kept = self._fps
        except AttributeError:
            kept = []
            self._fps = kept
        kept.append(proc)
    return __queryparser_add_prefix_orig(self, s, proc)
_queryparser_add_prefix.__doc__ = __queryparser_add_prefix_orig.__doc__
QueryParser.add_prefix = _queryparser_add_prefix
del _queryparser_add_prefix
# Same retention scheme for add_boolean_prefix(): anything passed as `proc`
# which is not a str or bytes object is kept in the parser's `_fps` list.
__queryparser_add_boolean_prefix_orig = QueryParser.add_boolean_prefix
def _queryparser_add_boolean_prefix(self, s, proc, exclusive = True):
    needs_keeping = not isinstance(proc, (str, bytes))
    if needs_keeping:
        try:
            kept = self._fps
        except AttributeError:
            kept = []
            self._fps = kept
        kept.append(proc)
    return __queryparser_add_boolean_prefix_orig(self, s, proc, exclusive)
_queryparser_add_boolean_prefix.__doc__ = __queryparser_add_boolean_prefix_orig.__doc__
QueryParser.add_boolean_prefix = _queryparser_add_boolean_prefix
del _queryparser_add_boolean_prefix
# Whenever a Stopper is set on the QueryParser, hold a python reference to it
# so that it won't be deleted.  This hack can probably be removed once xapian
# bug #186 is fixed.
__queryparser_set_stopper_orig = QueryParser.set_stopper
def _queryparser_set_stopper(self, stopper):
    # Record the reference first, then delegate to the wrapped method.
    self._stopper = stopper
    result = __queryparser_set_stopper_orig(self, stopper)
    return result
_queryparser_set_stopper.__doc__ = __queryparser_set_stopper_orig.__doc__
QueryParser.set_stopper = _queryparser_set_stopper
del _queryparser_set_stopper
# Whenever a Stopper is set on the TermGenerator, hold a python reference to
# it so that it won't be deleted.  This hack can probably be removed once
# xapian bug #186 is fixed.
__termgenerator_set_stopper_orig = TermGenerator.set_stopper
def _termgenerator_set_stopper(self, stopper):
    # Record the reference first, then delegate to the wrapped method.
    self._stopper = stopper
    result = __termgenerator_set_stopper_orig(self, stopper)
    return result
_termgenerator_set_stopper.__doc__ = __termgenerator_set_stopper_orig.__doc__
TermGenerator.set_stopper = _termgenerator_set_stopper
del _termgenerator_set_stopper
# Whenever a Sorter is set on enquire, hold a python reference to it so that
# it won't be deleted.  This hack can probably be removed once xapian bug #186
# is fixed.
__enquire_set_sort_by_key_orig = Enquire.set_sort_by_key
def _enquire_set_sort_by_key(self, sorter, reverse):
    # Record the reference first, then delegate to the wrapped method.
    self._sorter = sorter
    result = __enquire_set_sort_by_key_orig(self, sorter, reverse)
    return result
_enquire_set_sort_by_key.__doc__ = __enquire_set_sort_by_key_orig.__doc__
Enquire.set_sort_by_key = _enquire_set_sort_by_key
del _enquire_set_sort_by_key
# Same retention scheme for set_sort_by_key_then_relevance(): the sorter is
# stored on the Enquire object as `_sorter`.
__enquire_set_sort_by_key_then_relevance_orig = Enquire.set_sort_by_key_then_relevance
def _enquire_set_sort_by_key_then_relevance(self, sorter, reverse):
    # Record the reference first, then delegate to the wrapped method.
    self._sorter = sorter
    result = __enquire_set_sort_by_key_then_relevance_orig(self, sorter, reverse)
    return result
_enquire_set_sort_by_key_then_relevance.__doc__ = __enquire_set_sort_by_key_then_relevance_orig.__doc__
Enquire.set_sort_by_key_then_relevance = _enquire_set_sort_by_key_then_relevance
del _enquire_set_sort_by_key_then_relevance
# Same retention scheme for set_sort_by_relevance_then_key(): the sorter is
# stored on the Enquire object as `_sorter`.
__enquire_set_sort_by_relevance_then_key_orig = Enquire.set_sort_by_relevance_then_key
def _enquire_set_sort_by_relevance_then_key(self, sorter, reverse):
    # Record the reference first, then delegate to the wrapped method.
    self._sorter = sorter
    result = __enquire_set_sort_by_relevance_then_key_orig(self, sorter, reverse)
    return result
_enquire_set_sort_by_relevance_then_key.__doc__ = __enquire_set_sort_by_relevance_then_key_orig.__doc__
Enquire.set_sort_by_relevance_then_key = _enquire_set_sort_by_relevance_then_key
del _enquire_set_sort_by_relevance_then_key
792 ##########################################
793 # Support for iteration of posting lists #
794 ##########################################
class PostingItem(object):
    """An item returned from iteration of a posting list.

    The item supports access to the following attributes and properties:

     - `docid`: The document ID corresponding to this PostingItem.
     - `doclength`: The length of the document corresponding to this
       PostingItem.
     - `wdf`: The within document frequency of the term which the posting list
       is for in the document corresponding to this PostingItem.
     - `positer`: An iterator over the positions which the term corresponding
       to this posting list occurs at in the document corresponding to this
       PostingItem.  This is only available until the iterator which returned
       this item next moves.

    """
    __slots__ = ('_iter', 'docid', 'doclength', 'wdf',)

    def __init__(self, iter):
        """Create an item for the current position of the PostingIter `iter`.

        The document ID, document length and wdf are read immediately, so
        they stay available after the iterator has moved on.

        """
        self._iter = iter
        self.docid = iter._iter.get_docid()
        self.doclength = iter._iter.get_doclength()
        self.wdf = iter._iter.get_wdf()

    def _get_positer(self):
        """Get a position list iterator.

        The iterator will return integers representing the positions that the
        term occurs at in the document corresponding to this PostingItem.

        This will raise an InvalidOperationError exception if the iterator
        this item came from doesn't support position lists, or if the iterator
        has moved on since the item was returned from it.

        """
        if not self._iter._has_positions:
            raise InvalidOperationError("Iterator does not support position lists")
        # The parent iterator has moved if it has reached the end, or if it
        # now points at a different document.
        if self._iter._iter == self._iter._end or \
           self.docid != self._iter._iter.get_docid():
            raise InvalidOperationError("Iterator has moved, and does not support random access")
        return PositionIter(self._iter._iter._positionlist_begin(),
                            self._iter._iter._positionlist_end())
    positer = property(_get_positer, doc=
    """A position iterator for the current posting (if meaningful).

    The iterator will return integers representing the positions that the term
    occurs at.

    This will raise an InvalidOperationError exception if the iterator this
    item came from doesn't support position lists, or if the iterator has moved
    on since the item was returned from it.

    """)
class PostingIter(object):
    """Iterator over a posting list.

    Each step produces a PostingItem object, which is evaluated lazily where
    appropriate.

    """
    __slots__ = ('_iter', '_end', '_has_positions', '_moved')

    def __init__(self, start, end, has_positions=False):
        self._iter = start
        self._end = end
        self._has_positions = has_positions

        # _moved is True when the underlying iterator already points at the
        # entry which the next call to __next__() should return.  The
        # underlying iterator starts out on a valid entry, so __next__()
        # can't simply advance it unconditionally; instead the advance is
        # postponed until just before the following entry is requested.
        self._moved = True

    def __iter__(self):
        return self

    def __next__(self):
        # Perform the advance which was postponed by the previous call.
        if not self._moved:
            next(self._iter)
            self._moved = True

        if self._iter == self._end:
            raise StopIteration
        self._moved = False
        return PostingItem(self)

    def skip_to(self, docid):
        """Skip the iterator forward.

        The iterator is advanced to the first document with a document ID
        which is greater than or equal to the supplied document ID.

        If there are no such items, this will raise StopIteration.

        This returns the item which the iterator is moved to.  The subsequent
        item will be returned the next time that next() is called (unless
        skip_to() is called again first).

        """
        at_end = not (self._iter != self._end)
        if not at_end:
            self._iter.skip_to(docid)
        if self._iter == self._end:
            self._moved = True
            raise StopIteration
        self._moved = False
        return PostingItem(self)
def _database_gen_postlist_iter(self, tname):
    """Get an iterator over the postings which are indexed by a given term.

    If `tname` is empty, an iterator over all the documents will be returned
    (this will contain one entry for each document, will always return a wdf
    of 1, and will not allow access to a position iterator).

    """
    # Position access is only offered for a non-empty term name.
    has_positions = len(tname) != 0
    first = self._postlist_begin(tname)
    last = self._postlist_end(tname)
    return PostingIter(first, last, has_positions=has_positions)
Database.postlist = _database_gen_postlist_iter
928 ###########################################
929 # Support for iteration of position lists #
930 ###########################################
class PositionIter(object):
    """Iterate over a position list.

    Each step yields an integer position; positions come out in ascending
    order.

    """

    def __init__(self, start=0, end=0):
        """Wrap the underlying iterator pair `start` / `end`.

        With both arguments left at their defaults the two ends compare
        equal, so the iterator yields nothing.

        """
        self.iter, self.end = start, end

    def __iter__(self):
        return self

    def __next__(self):
        """Return the current position and step the underlying iterator."""
        if self.iter == self.end:
            raise StopIteration

        position = self.iter.get_termpos()
        next(self.iter)
        return position
953 # Modify Database to add a "positionlist()" method.
def _database_gen_positionlist_iter(self, docid, tname):
    """Get an iterator over all the positions in a given document of a term.

    The iterator will return integers, in ascending order.

    """
    first = self._positionlist_begin(docid, tname)
    last = self._positionlist_end(docid, tname)
    return PositionIter(first, last)
Database.positionlist = _database_gen_positionlist_iter
963 ########################################
964 # Support for iteration of value lists #
965 ########################################
class ValueItem(object):
    """A single value yielded while iterating the values of a document.

    Two attributes are available:

     - `num`: The number of the value.
     - `value`: The contents of the value.

    """

    __slots__ = ('num', 'value')

    def __init__(self, num, value):
        self.num, self.value = num, value
class ValueIter(object):
    """Iterate over all the values stored in a document.

    Each step yields a ValueItem; items come out in ascending order of
    value number.

    """

    def __init__(self, start, end):
        self.iter, self.end = start, end

    def __iter__(self):
        return self

    def __next__(self):
        """Return the current value as a ValueItem and step forward."""
        if self.iter == self.end:
            raise StopIteration

        current = ValueItem(self.iter.get_valueno(), self.iter.get_value())
        next(self.iter)
        return current
1004 # Modify Document to add a "values()" method.
def _document_gen_values_iter(self):
    """Get an iterator over all the values stored in a document.

    The iterator will return ValueItem objects, in ascending order of value
    number.

    """
    first = self._values_begin()
    last = self._values_end()
    return ValueIter(first, last)
Document.values = _document_gen_values_iter
1015 ##########################################
1016 # Support for iteration of value streams #
1017 ##########################################
class ValueStreamItem(object):
    """A single entry yielded while iterating a value stream.

    Two attributes are available:

     - `docid`: The docid for the item.
     - `value`: The contents of the value.

    """

    __slots__ = ('docid', 'value')

    def __init__(self, docid, value):
        self.docid, self.value = docid, value
class ValueStreamIter(object):
    """Iterate over the entries of a value stream.

    Each step yields a ValueStreamItem built from the docid and value the
    underlying iterator currently points at.

    """

    def __init__(self, start, end):
        self.iter, self.end = start, end
        # True while the underlying iterator already sits on the entry
        # that the next call to __next__() should return.
        self.moved = True

    def __iter__(self):
        return self

    def __next__(self):
        """Return the next ValueStreamItem, or raise StopIteration."""
        if not self.moved:
            # The current entry has already been handed out: step past it.
            self.iter.__next__()
            self.moved = True

        if self.iter == self.end:
            raise StopIteration

        self.moved = False
        return ValueStreamItem(self.iter.get_docid(), self.iter.get_value())

    def skip_to(self, docid):
        """Skip the iterator forward.

        The iterator is advanced to the first document with a document ID
        which is greater than or equal to the supplied document ID.

        If there are no such items, this will raise StopIteration.

        This returns the item which the iterator is moved to.  The
        subsequent item will be returned the next time that next() is
        called (unless skip_to() is called again first).

        """
        # An exhausted iterator is left alone.
        if self.iter != self.end:
            self.iter.skip_to(docid)

        if self.iter == self.end:
            self.moved = True
            raise StopIteration

        self.moved = False
        return ValueStreamItem(self.iter.get_docid(), self.iter.get_value())
1081 # Modify Database to add a "valuestream()" method, and remove the
1082 # valuestream_begin() and valuestream_end() methods.
def wrapper():
    """Build the Database.valuestream() method.

    The original valuestream_begin() and valuestream_end() methods are
    captured in the closure and removed from Database, so that only the
    returned `valuestream` function can reach them.

    """
    begin_fn = Database.valuestream_begin
    del Database.valuestream_begin
    end_fn = Database.valuestream_end
    del Database.valuestream_end

    def valuestream(self, slot):
        """Get an iterator over all the values stored in a slot in the database.

        The iterator will return ValueStreamItem objects, in ascending order of
        document id.

        """
        first = begin_fn(self, slot)
        last = end_fn(self, slot)
        return ValueStreamIter(first, last)

    return valuestream
Database.valuestream = wrapper()
del wrapper
1100 ##########################################
1101 # Support for iteration of LatLongCoords #
1102 ##########################################
class LatLongCoordsIter(object):
    """An iterator over all the coordinates in a LatLongCoords object.

    The iterator returns LatLongCoord objects.

    Two LatLongCoordsIter objects compare equal when both their current
    position and their end marker compare equal (using the `equals()`
    method of the wrapped iterators).

    """

    def __init__(self, start, end):
        self.iter = start
        self.end = end

    def __iter__(self):
        return self

    def equals(self, other):
        """Return True if `other` wraps an equal position and end marker.

        `other` must be another LatLongCoordsIter.  This method was
        previously missing even though __eq__() and __ne__() relied on
        it, so every comparison raised AttributeError.

        """
        return self.iter.equals(other.iter) and self.end.equals(other.end)

    def __eq__(self, other):
        if not isinstance(other, LatLongCoordsIter):
            # Let Python fall back to its default handling for foreign types.
            return NotImplemented
        return self.equals(other)

    def __ne__(self, other):
        if not isinstance(other, LatLongCoordsIter):
            return NotImplemented
        return not self.equals(other)

    # Defining __eq__ already makes instances unhashable; state it
    # explicitly since the iterator's position changes as it is consumed.
    __hash__ = None

    def __next__(self):
        """Return the current coordinate and step the wrapped iterator."""
        if self.iter.equals(self.end):
            raise StopIteration
        coord = self.iter.get_coord()
        self.iter.__next__()
        return coord
1131 # Modify LatLongCoords to make it iterable.
def _latlongcoords_iter(self):
    """Get an iterator over all the coordinates in a LatLongCoords.

    The iterator will return xapian.LatLongCoord objects.

    """
    first = self.begin()
    last = self.end()
    return LatLongCoordsIter(first, last)
LatLongCoords.__iter__ = _latlongcoords_iter
# Drop the helper name and the LatLongCoordsIterator name from this
# namespace now that LatLongCoords is iterable.
del _latlongcoords_iter
del LatLongCoordsIterator
1143 # Fix up Enquire so that it keeps a python reference to the deciders supplied
1144 # to it so that they won't be deleted before the Enquire object. This hack can
1145 # probably be removed once xapian bug #186 is fixed.
_enquire_add_matchspy_orig = Enquire.add_matchspy
def _enquire_match_spy_add(self, decider):
    # Hold a Python-side reference to the decider on this object before
    # handing it to the wrapped add_matchspy().
    try:
        kept = self._deciders
    except AttributeError:
        kept = self._deciders = []
    kept.append(decider)
    _enquire_add_matchspy_orig(self, decider)
_enquire_match_spy_add.__doc__ = Enquire.add_matchspy.__doc__
Enquire.add_matchspy = _enquire_match_spy_add
_enquire_clear_matchspies_orig = Enquire.clear_matchspies
def _enquire_match_spies_clear(self):
    # Clear the spies through the wrapped method first, then drop the
    # Python-side references (if any were ever recorded).
    _enquire_clear_matchspies_orig(self)
    if not hasattr(self, '_deciders'):
        return
    del self._deciders
_enquire_match_spies_clear.__doc__ = Enquire.clear_matchspies.__doc__
Enquire.clear_matchspies = _enquire_match_spies_clear
1164 # Fix up Stem.__init__() so that it calls __disown__() on the passed
1165 # StemImplementation object so that Python won't delete it from under us.
_stem_init_orig = Stem.__init__
def _stem_init(self, *args):
    _stem_init_orig(self, *args)
    if not args:
        return
    # A StemImplementation passed as the first argument is disowned after
    # the wrapped constructor has run, so that Python won't delete it.
    implementation = args[0]
    if isinstance(implementation, StemImplementation):
        implementation.__disown__()
_stem_init.__doc__ = Stem.__init__.__doc__
Stem.__init__ = _stem_init
# Remove static methods which shouldn't be in the API.
del Document_unserialise, Query_unserialise, Stem_get_available_languages

# Add wrappers for Query::MatchAll and Query::MatchNothing: an empty-term
# query and a default-constructed query respectively.
Query.MatchAll = Query("")
Query.MatchNothing = Query()
1185 # Set the list of names which should be public.
1186 # Note that this needs to happen at the end of xapian.py.
__all__ = []
for item in dir():
    # Keep a name only if it is public: no leading underscore, and not a
    # SWIG registration helper or raw iterator class.
    if not (item.startswith('_') or item.endswith(('_swigregister', 'Iterator'))):
        __all__.append(item)
__all__ = tuple(__all__)
1195 /* vim:syntax=python:set expandtab: */