# -*- coding: utf-8 -*-
"""
    Create a search index for offline search.

    :copyright: 2007 by Armin Ronacher.
    :license: Python license.
"""
import pickle
import re
from collections import defaultdict

from docutils.nodes import Text, NodeVisitor

from .stemmer import PorterStemmer
from .json import dump_json
# Matches runs of word characters.  The inline ``(?u)`` (Unicode) flag has to
# lead the pattern: a global inline flag anywhere else is deprecated since
# Python 3.6 and raises re.error from Python 3.11 on.
word_re = re.compile(r'(?u)\w+')
class Stemmer(PorterStemmer):
    """
    All those porter stemmer implementations look hideous;
    make at least the `stem` method nicer.

    Callers pass only the word; the offsets required by
    `PorterStemmer.stem` are filled in here.
    """

    def stem(self, word):
        """Return the result of stemming the whole of *word*."""
        # The base method is given the index of the first and of the last
        # character of the word.
        return PorterStemmer.stem(self, word, 0, len(word) - 1)
class WordCollector(NodeVisitor):
    """
    A special visitor that collects words for the `IndexBuilder`.

    Every word matched by `word_re` in a visited `Text` node is appended
    to the `found_words` list.
    """

    def __init__(self, document):
        NodeVisitor.__init__(self, document)
        # Has to exist before the first `dispatch_visit` call extends it.
        self.found_words = []

    def dispatch_visit(self, node):
        """Collect the words of *node* if it is a `Text` node."""
        # Exact class comparison: instances of Text subclasses are skipped.
        if node.__class__ is Text:
            self.found_words.extend(word_re.findall(node.astext()))
class IndexBuilder(object):
    """
    Helper class that creates a searchindex based on the doctrees
    passed to the `feed` method.
    """

    # Serializers selectable through the `format` argument of `dump`.
    # NOTE(review): the 'json' entry is inferred from the `dump_json`
    # import -- confirm against the original file.
    formats = {
        'json':     dump_json,
        'pickle':   pickle.dumps,
    }

    def __init__(self):
        self._filenames = {}    # filename -> file id
        self._categories = {}   # category -> set of file ids
        self._titles = {}       # file id -> title
        self._mapping = {}      # stemmed, lowercased word -> set of file ids
        self._stemmer = Stemmer()

    def dump(self, stream, format):
        """Dump the frozen index to a stream.

        *format* selects the serializer from `formats`; a name that is
        not a key of that dict raises `KeyError`.
        """
        serialize = self.formats[format]
        stream.write(serialize(self.freeze()))

    def freeze(self):
        """
        Create a usable data structure.  You can pass this output
        to the `SearchFrontend` to search the index.

        Returns a list of four items: the filenames, the category mapping,
        the titles and the word mapping.
        """
        # NOTE(review): sort keys reconstructed.  Filenames are ordered by
        # file id and titles by their key (also the file id) so that
        # `SearchFrontend` can index both lists with a file id.
        filenames = sorted(self._filenames.items(),
                           key=lambda item: item[1])
        titles = sorted(self._titles.items(), key=lambda item: item[0])
        categories = sorted(self._categories.items(),
                            key=lambda item: item[0])
        mapping = sorted(self._mapping.items(), key=lambda item: item[0])
        return [
            [name for name, _ in filenames],
            dict(categories),
            [title for _, title in titles],
            dict(mapping),
        ]

    def feed(self, filename, category, title, doctree):
        """Feed a doctree to the index."""
        # A filename seen before keeps its id; a new one gets the next id.
        file_id = self._filenames.setdefault(filename, len(self._filenames))
        self._titles[file_id] = title
        self._categories.setdefault(category, set()).add(file_id)

        visitor = WordCollector(doctree)
        # NOTE(review): the walk call is reconstructed -- without it
        # `visitor.found_words` would never be filled.
        doctree.walk(visitor)

        # Words of the title are indexed together with the body words.
        for word in word_re.findall(title) + visitor.found_words:
            self._mapping.setdefault(self._stemmer.stem(word.lower()),
                                     set()).add(file_id)
class SearchFrontend(object):
    """
    This class acts as a frontend for the search index.  It can search
    a searchindex as provided by `IndexBuilder`.
    """

    def __init__(self, index):
        # *index* must unpack into exactly these four parts, in this order.
        self.filenames, self.areas, self.titles, self.words = index
        self._stemmer = Stemmer()

    def query(self, required, excluded, areas):
        """Return ``(filename, title)`` pairs of the matching files.

        A file matches when it contains every word of *required*, belongs
        to at least one of *areas* and contains no word of *excluded*.
        The words are looked up in the index as given.  The result is a
        list sorted case-insensitively by title.
        """
        file_map = defaultdict(set)
        for word in required:
            if word not in self.words:
                # NOTE(review): body reconstructed.  A required word that
                # is not indexed can never be matched, and the filter
                # below demands all required words, so nothing can match.
                return []
            for fid in self.words[word]:
                file_map[fid].add(word)

        # `items()` instead of `iteritems()` so this also runs on Python 3.
        matches = [
            (self.filenames[fid], self.titles[fid])
            for fid, words in file_map.items()
            if len(words) == len(required)
            and any(fid in self.areas.get(area, ()) for area in areas)
            and not any(fid in self.words.get(word, ()) for word in excluded)
        ]
        matches.sort(key=lambda match: match[1].lower())
        return matches

    def search(self, searchstring, areas):
        """Search the index for the whitespace separated *searchstring*.

        Terms starting with ``-`` are excluded, all others are required.
        Each term is lowercased and stemmed, then handed to `query`.
        """
        required = set()
        excluded = set()
        for word in searchstring.split():
            if word.startswith('-'):
                storage = excluded
                word = word[1:]
                if not word:
                    # A bare "-" carries no term to exclude.
                    continue
            else:
                storage = required
            storage.add(self._stemmer.stem(word.lower()))

        return self.query(required, excluded, areas)