Classifier Python :: 2.4 to 2.7
[docutils.git] / sandbox / py-rest-doc / sphinx / search.py
blob4507bcbcae8da9100f8f53e8269e1cc8bb76a11b
1 # -*- coding: utf-8 -*-
2 """
3 sphinx.search
4 ~~~~~~~~~~~~~
6 Create a search index for offline search.
8 :copyright: 2007 by Armin Ronacher.
9 :license: Python license.
10 """
11 import re
12 import pickle
14 from collections import defaultdict
15 from docutils.nodes import Text, NodeVisitor
16 from .stemmer import PorterStemmer
17 from .json import dump_json
# Matches runs of word characters, Unicode-aware.  The flag is passed as an
# argument rather than as a trailing inline ``(?u)``: inline global flags
# that are not at the very start of a pattern are rejected by newer
# versions of the ``re`` module.
word_re = re.compile(r'\w+', re.UNICODE)
class Stemmer(PorterStemmer):
    """
    Convenience wrapper around `PorterStemmer`.

    The inherited ``stem`` is called with two extra positional
    arguments besides the word; this subclass fills them in so that
    callers only have to pass the word itself.
    """

    def stem(self, word):
        """Return the stem of `word`, processing it from index 0 to its last index."""
        first = 0
        last = len(word) - 1
        return PorterStemmer.stem(self, word, first, last)
class WordCollector(NodeVisitor):
    """
    Node visitor that gathers the words of the text nodes it is shown.

    Every match of `word_re` in a text node is appended to
    `found_words`, which the `IndexBuilder` reads after the walk.
    """

    def __init__(self, document):
        NodeVisitor.__init__(self, document)
        # words in the order they were encountered; duplicates are kept
        self.found_words = []

    def dispatch_visit(self, node):
        """Record the words of `node` if its class is exactly `Text`."""
        # Identity check on the class: nodes of any other class
        # (subclasses of Text included) are ignored.
        if node.__class__ is not Text:
            return
        text = node.astext()
        self.found_words.extend(word_re.findall(text))
class IndexBuilder(object):
    """
    Builds a search index out of the doctrees handed to `feed`.

    Use `freeze` to obtain the index as a plain data structure or
    `dump` to write a serialized form of it to a stream.
    """
    # serializer name -> callable that turns the frozen index into
    # something that can be written to a stream
    formats = {
        'json': dump_json,
        'pickle': pickle.dumps
    }

    def __init__(self):
        # filename -> numeric file id (assigned in `feed`)
        self._filenames = {}
        # stemmed, lowercased word -> set of file ids containing it
        self._mapping = {}
        # file id -> title
        self._titles = {}
        # category -> set of file ids
        self._categories = {}
        self._stemmer = Stemmer()

    def dump(self, stream, format):
        """Write the frozen index to `stream`, serialized with `format`.

        `format` must be a key of `formats`; an unknown name raises
        `KeyError`.
        """
        serialize = self.formats[format]
        stream.write(serialize(self.freeze()))

    def freeze(self):
        """
        Return the index as a four item list that can be handed to
        `SearchFrontend`:

        1. the filenames, ordered by file id
        2. a dict mapping each category to its set of file ids
        3. the titles, ordered by file id
        4. a dict mapping each stemmed word to its set of file ids
        """
        filenames = sorted(self._filenames, key=self._filenames.get)
        categories = dict((name, self._categories[name])
                          for name in sorted(self._categories))
        titles = [self._titles[fid] for fid in sorted(self._titles)]
        words = dict((word, self._mapping[word])
                     for word in sorted(self._mapping))
        return [filenames, categories, titles, words]

    def feed(self, filename, category, title, doctree):
        """Add the words of `title` and `doctree` to the index.

        `filename` receives the next free file id unless it already
        has one; the file id is also recorded under `category`.
        """
        doc_id = self._filenames.setdefault(filename, len(self._filenames))
        self._titles[doc_id] = title

        collector = WordCollector(doctree)
        doctree.walk(collector)

        self._categories.setdefault(category, set()).add(doc_id)

        stem = self._stemmer.stem
        for raw in word_re.findall(title) + collector.found_words:
            stemmed = stem(raw.lower())
            self._mapping.setdefault(stemmed, set()).add(doc_id)
class SearchFrontend(object):
    """
    This class acts as a frontend for the search index.  It can search
    a searchindex as provided by `IndexBuilder`.
    """

    def __init__(self, index):
        """Unpack `index`, the four item structure returned by
        `IndexBuilder.freeze`: filenames, areas (categories), titles
        and the word mapping.
        """
        self.filenames, self.areas, self.titles, self.words = index
        self._stemmer = Stemmer()

    def query(self, required, excluded, areas):
        """Return ``(filename, title)`` pairs for all matching files.

        A file matches if it contains every word in `required`, none
        of the words in `excluded`, and belongs to at least one of
        `areas`.  The words must already be stemmed and lowercased
        (see `search`).  The result is sorted case-insensitively by
        title; it is empty when `required` is empty.
        """
        file_map = defaultdict(set)
        for word in required:
            if word not in self.words:
                # A required word that is not indexed at all can never
                # be satisfied, so there is nothing left to look for.
                return []
            for fid in self.words[word]:
                file_map[fid].add(word)

        hits = []
        # ``items()`` instead of ``iteritems()``: the latter exists on
        # Python 2 dictionaries only.
        for fid, words in file_map.items():
            if len(words) != len(required):
                continue
            if not any(fid in self.areas.get(area, ()) for area in areas):
                continue
            if any(fid in self.words.get(word, ()) for word in excluded):
                continue
            hits.append((self.filenames[fid], self.titles[fid]))
        hits.sort(key=lambda hit: hit[1].lower())
        return hits

    def search(self, searchstring, areas):
        """Parse `searchstring` and return the result of `query`.

        The string is split on whitespace.  Words prefixed with ``-``
        are excluded, all others are required; every word is
        lowercased and stemmed before the lookup.
        """
        required = set()
        excluded = set()
        for word in searchstring.split():
            if word.startswith('-'):
                storage = excluded
                word = word[1:]
            else:
                storage = required
            if not word:
                # a lone "-" carries no search term
                continue
            storage.add(self._stemmer.stem(word.lower()))

        return self.query(required, excluded, areas)