sandbox/py-rest-doc/sphinx/search.py

   1 # -*- coding: utf-8 -*-
   2 """
   3     sphinx.search
   4     ~~~~~~~~~~~~~
   5
   6     Create a search index for offline search.
   7
   8     :copyright: 2007 by Armin Ronacher.
   9     :license: Python license.
  10 """
  11 import re
  12 import pickle
  13
  14 from collections import defaultdict
  15 from docutils.nodes import Text, NodeVisitor
  16 from .stemmer import PorterStemmer
  17 from .json import dump_json
  18
  19
  20 word_re = re.compile(r'\w+(?u)')
  21
  22
  23 class Stemmer(PorterStemmer):
  24     """
  25     All those porter stemmer implementations look hideous.
  26     make at least the stem method nicer.
  27     """
  28
  29     def stem(self, word):
  30         return PorterStemmer.stem(self, word, 0, len(word) - 1)
  31
  32
  33 class WordCollector(NodeVisitor):
  34     """
  35     A special visitor that collects words for the `IndexBuilder`.
  36     """
  37
  38     def __init__(self, document):
  39         NodeVisitor.__init__(self, document)
  40         self.found_words = []
  41
  42     def dispatch_visit(self, node):
  43         if node.__class__ is Text:
  44             self.found_words.extend(word_re.findall(node.astext()))
  45
  46
  47 class IndexBuilder(object):
  48     """
  49     Helper class that creates a searchindex based on the doctrees
  50     passed to the `feed` method.
  51     """
  52     formats = {
  53         'json':     dump_json,
  54         'pickle':   pickle.dumps
  55     }
  56
  57     def __init__(self):
  58         self._filenames = {}
  59         self._mapping = {}
  60         self._titles = {}
  61         self._categories = {}
  62         self._stemmer = Stemmer()
  63
  64     def dump(self, stream, format):
  65         """Dump the freezed index to a stream."""
  66         stream.write(self.formats[format](self.freeze()))
  67
  68     def freeze(self):
  69         """
  70         Create a useable data structure. You can pass this output
  71         to the `SearchFrontend` to search the index.
  72         """
  73         return [
  74             [k for k, v in sorted(self._filenames.items(),
  75                                   key=lambda x: x[1])],
  76             dict(item for item in sorted(self._categories.items(),
  77                                          key=lambda x: x[0])),
  78             [v for k, v in sorted(self._titles.items(),
  79                                   key=lambda x: x[0])],
  80             dict(item for item in sorted(self._mapping.items(),
  81                                          key=lambda x: x[0])),
  82         ]
  83
  84     def feed(self, filename, category, title, doctree):
  85         """Feed a doctree to the index."""
  86         file_id = self._filenames.setdefault(filename, len(self._filenames))
  87         self._titles[file_id] = title
  88         visitor = WordCollector(doctree)
  89         doctree.walk(visitor)
  90         self._categories.setdefault(category, set()).add(file_id)
  91         for word in word_re.findall(title) + visitor.found_words:
  92             self._mapping.setdefault(self._stemmer.stem(word.lower()),
  93                                      set()).add(file_id)
  94
  95
  96 class SearchFrontend(object):
  97     """
  98     This class acts as a frontend for the search index. It can search
  99     a searchindex as provided by `IndexBuilder`.
 100     """
 101
 102     def __init__(self, index):
 103         self.filenames, self.areas, self.titles, self.words = index
 104         self._stemmer = Stemmer()
 105
 106     def query(self, required, excluded, areas):
 107         file_map = defaultdict(set)
 108         for word in required:
 109             if word not in self.words:
 110                 break
 111             for fid in self.words[word]:
 112                 file_map[fid].add(word)
 113
 114         return sorted(((self.filenames[fid], self.titles[fid])
 115             for fid, words in file_map.iteritems()
 116             if len(words) == len(required) and
 117                any(fid in self.areas.get(area, ()) for area in areas) and not
 118                any(fid in self.words.get(word, ()) for word in excluded)
 119         ), key=lambda x: x[1].lower())
 120
 121     def search(self, searchstring, areas):
 122         required = set()
 123         excluded = set()
 124         for word in searchstring.split():
 125             if word.startswith('-'):
 126                 storage = excluded
 127                 word = word[1:]
 128             else:
 129                 storage = required
 130             storage.add(self._stemmer.stem(word.lower()))
 131
 132         return self.query(required, excluded, areas)